From 75503d3d098a981ec6eeb5b2f93bceaec9300034 Mon Sep 17 00:00:00 2001 From: weijinqian0 <1184188277@qq.com> Date: Mon, 14 Jul 2025 14:29:15 +0800 Subject: [PATCH 01/56] [0.9.1][Feature]Moe alltoallv communication optimization for unquantized RL training sence & alltoallv support dpo (#1547) Signed-off-by: weijinqian_v1 --- requirements.txt | 2 + tests/ut/test_distributed_tensor_parallel.py | 139 +++++ tests/ut/test_token_dispatcher.py | 69 +++ vllm_ascend/ascend_forward_context.py | 5 + vllm_ascend/attention/attention_v1.py | 13 + vllm_ascend/distributed/tensor_parallel.py | 248 ++++++++ vllm_ascend/envs.py | 11 +- vllm_ascend/models/__init__.py | 10 +- vllm_ascend/models/qwen3_dbo.py | 552 +++++++++++++++++ vllm_ascend/models/qwen3_moe.py | 4 + vllm_ascend/multistream/ms_split.py | 115 +++- vllm_ascend/ops/comm_utils.py | 127 ++++ vllm_ascend/ops/fused_moe.py | 153 ++++- vllm_ascend/ops/moe_dispatcher/__init__.py | 0 .../ops/moe_dispatcher/token_dispatcher.py | 578 ++++++++++++++++++ 15 files changed, 2004 insertions(+), 22 deletions(-) create mode 100644 tests/ut/test_distributed_tensor_parallel.py create mode 100644 tests/ut/test_token_dispatcher.py create mode 100644 vllm_ascend/distributed/tensor_parallel.py create mode 100644 vllm_ascend/models/qwen3_dbo.py create mode 100644 vllm_ascend/ops/comm_utils.py create mode 100644 vllm_ascend/ops/moe_dispatcher/__init__.py create mode 100644 vllm_ascend/ops/moe_dispatcher/token_dispatcher.py diff --git a/requirements.txt b/requirements.txt index c2b2a3175eb..effdf838b52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,5 @@ numba --pre --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi torch-npu==2.5.1.post1.dev20250619 + +pytest_mock diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py new file mode 100644 index 00000000000..5a438e0cdf9 --- /dev/null +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+ +import importlib +import unittest +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from vllm_ascend.distributed.tensor_parallel import ( + _gather_along_first_dim, _gather_along_last_dim, + _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, + all_to_all_hp2sp, all_to_all_sp2hp) + + +@pytest.fixture +def test_tensor(): + return torch.randn(8, 16) + + +@pytest.fixture +def test_tensor_last_dim(): + return torch.randn(8, 16, 32) + + +@pytest.fixture +def mock_group(): + return MagicMock() + + +@pytest.fixture(autouse=True) +def mock_dist(): + with patch("torch.distributed") as mock: + mock.get_world_size.return_value = 4 + mock.get_rank.return_value = 0 + yield mock + + +class TestDistributedCommunication(unittest.TestCase): + + @pytest.mark.parametrize("world_size", [1, 4]) + def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, + world_size): + """test _gather_along_first_dim""" + mock_dist.get_world_size.return_value = world_size + + result = _gather_along_first_dim(test_tensor, mock_group) + + if world_size == 1: + self.assertEqual(result.shape, (8, 16)) + else: + self.assertEqual(result.shape, (32, 16)) # 8*4=32 + + def test_gather_along_first_dim_unequal_split(self, test_tensor, + mock_group): + """test unequal split""" + output_split_sizes = [5, 10, 15, 2] + result = _gather_along_first_dim(test_tensor, mock_group, + output_split_sizes) + self.assertEqual(result.shape, (32, 16)) # 5+10+15+2=32 + + @pytest.mark.parametrize("world_size", [1, 4]) + def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, + mock_dist, world_size): + """test _gather_along_last_dim""" + mock_dist.get_world_size.return_value = world_size + + result = _gather_along_last_dim(test_tensor_last_dim, mock_group) + + self.assertEqual(result.shape, (8, 16, 32 * world_size)) + + @pytest.mark.parametrize("input_shape,expected_shape", [ + ((32, 16), (8, 16)), + ((40, 10), (10, 10)), + ]) + def test_reduce_scatter_along_first_dim(self, mock_group, input_shape, + expected_shape): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_first_dim(input_tensor, mock_group) + self.assertEqual(result.shape, expected_shape) + + def test_reduce_scatter_along_last_dim(self, mock_group): + input_tensor = torch.randn(8, 16, 32) + result = _reduce_scatter_along_last_dim(input_tensor, mock_group) + self.assertEqual(result.shape, (8, 16, 8)) + + @pytest.mark.parametrize("func,input_shape,expected_shape", [ + ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), + (8, 16, 128)), + ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), + ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), + (8, 16, 8)), + ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), + ]) + def test_wrapper_functions(self, mock_group, func, input_shape, + expected_shape): + """test wrapper funcs""" + mod = importlib.import_module( + 'vllm_ascend.distributed.tensor_parallel') + globals = mod.__dict__ + test_func = globals[func] + input_tensor = torch.randn(*input_shape) + result = test_func(input_tensor, mock_group) + self.assertEqual(result.shape, expected_shape) + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] + ]) + def test_all_to_all_sp2hp(self, mock_group, input_shape, output_shape): + input_tensor = torch.randn(*input_shape) + result = all_to_all_sp2hp(input_tensor, mock_group) + self.assertEqual(result.shape, output_shape) + + 
@pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] + ]) + def test_all_to_all_hp2sp(self, mock_group, input_shape, output_shape): + input_tensor = torch.randn(*input_shape) + result = all_to_all_hp2sp(input_tensor, mock_group) + self.assertEqual(result.shape, output_shape) diff --git a/tests/ut/test_token_dispatcher.py b/tests/ut/test_token_dispatcher.py new file mode 100644 index 00000000000..18768a7fe8f --- /dev/null +++ b/tests/ut/test_token_dispatcher.py @@ -0,0 +1,69 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +import unittest + +import pytest +from pytest_mock import MockerFixture + +from vllm_ascend.ops.moe_dispatcher.token_dispatcher import ( + MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig) +from vllm_ascend.utils import adapt_patch # noqa E402 + +import vllm_ascend.patch.worker.patch_common.patch_utils # type: ignore[import] # isort: skip # noqa + +adapt_patch(True) + + +class TestMoEAlltoAllSeqOverLapDispatcher(unittest.TestCase): + + @pytest.fixture + def config(self): + config = MoEDispatcherConfig() + config.set_num_local_experts(2) + config.set_num_moe_experts(4) + config.set_moe_pad_expert_input_to_capacity(False) + config.set_moe_expert_capacity_factor(None) + config.set_moe_router_topk(2) + config.set_moe_grouped_gemm(False) + config.set_group_topk(0) + config.set_num_groups(1) + config.set_is_fused(False) + return config.build() + + def mock_ep_group(self, mocker): + mock_group = mocker.MagicMock() + mock_group.rank_in_group = 0 + mock_group.world_size = 2 + mock_group.device_group = "mock_group" + return mock_group + + @pytest.fixture + def dispatcher(self, config, mocker: MockerFixture): + mocker.patch( + "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group", + return_value=self.mock_ep_group(mocker)) + return MoEAlltoAllSeqOverLapDispatcher(config) + + def test_initialization(self, dispatcher, config): + self.assertEqual(dispatcher.num_local_experts, + config.num_local_experts) + self.assertEqual(dispatcher.num_experts, config.num_moe_experts) + self.assertEqual(dispatcher.local_expert_indices, [0, 1]) + self.assertEqual(dispatcher.ep_rank, 0) + self.assertEqual(dispatcher.ep_size, 2) + self.assertIsNotNone(dispatcher.overlap_stream) diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index 83e4ee8fea1..2d080793f52 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -18,6 +18,7 @@ class FusedMoEState(Enum): MC2 = 2 AllGatherEP = 3 NaiveMulticast = 4 + All2AllSeq = 5 # TODO(zzzzwwjj): add soc_version to choose branch @@ -33,6 +34,10 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool, return FusedMoEState.NaiveMulticast else: return FusedMoEState.AllGather + elif envs.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ: + # MC2 Dispatch/Combine 
performs better than alltoall_seq in decoding stage. + return (FusedMoEState.All2AllSeq if + (ep_size < 16 or with_prefill) else FusedMoEState.MC2) # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph. elif ep_size < 16 or with_prefill: return FusedMoEState.All2All diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 668c802c400..01b51e15607 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -28,6 +28,7 @@ from vllm.utils import direct_register_custom_op from vllm.v1.core.sched.output import SchedulerOutput +from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, nd_to_nz_2d, nd_to_nz_spec) @@ -150,6 +151,18 @@ class AscendMetadata: # (num_tokens,) slot_mapping: torch.Tensor = None + def split_metadata_for_multistream( + self, + ms_split_config: MSAttentionMetadataSplitConfig, + ) -> list["AscendMetadata"]: + """Split metadata for multi-stream with AscendMetadata""" + from vllm_ascend.multistream.ms_split import model_input_split_v1_attn + return model_input_split_v1_attn( + ms_split_config=ms_split_config, + attn_metadata=self, + _metadata_cls=AscendMetadata, + ) + class AscendAttentionMetadataBuilder: diff --git a/vllm_ascend/distributed/tensor_parallel.py b/vllm_ascend/distributed/tensor_parallel.py new file mode 100644 index 00000000000..3fff0a7243f --- /dev/null +++ b/vllm_ascend/distributed/tensor_parallel.py @@ -0,0 +1,248 @@ +# Copyright (c) 2024; NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Adapts from: Megatron/megatron/core/tensor_parallel/mappings.py. +# This file is a part of the vllm-ascend project. +import torch + + +def _gather_along_first_dim(input_, group, output_split_sizes=None): + """Gather tensors and concatenate along the first dimension. + + Args: + input_tensor (torch.Tensor): + A tensor to be gathered. + output_split_sizes (List[int], optional): + A list specifying the sizes of the output splits along the first dimension. + If None, equal splitting is assumed. Default: None. + + Returns: + torch.Tensor: Gathered tensor. + """ + world_size = torch.distributed.get_world_size(group) + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + dim_size = list(input_.size()) + if output_split_sizes is None: + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + torch.distributed.all_gather_into_tensor(output, + input_.contiguous(), + group=group) + else: + dim_size[0] = sum(output_split_sizes) + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + output_tensor_list = list( + torch.split(output, output_split_sizes, dim=0)) + torch.distributed.all_gather(output_tensor_list, input_, group=group) + + return output + + +def _gather_along_last_dim(input_, group): + """Gather tensors and concatenate along the last dimension.""" + + world_size = torch.distributed.get_world_size(group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + torch.distributed.all_gather_into_tensor(output, + input_.contiguous(), + group=group) + tensor_list = output.chunk(world_size, dim=0) + output = torch.cat(tensor_list, dim=-1).contiguous() + + return output + + +def _reduce_scatter_along_first_dim(input_, + group, + input_split_sizes=None, + use_global_buffer=False): + """Reduce-scatter the input tensor across model parallel group. + + Args: + input_ (torch.Tensor): The input tensor to be reduce-scattered. + input_split_sizes (List[int], optional): A list specifying the sizes of + the input splits along the first dimension for each rank. If None, + equal splitting is assumed. Default: None. + """ + world_size = torch.distributed.get_world_size(group) + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + if input_split_sizes is None: + dim_size = list(input_.size()) + assert ( + dim_size[0] % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" + + dim_size[0] = dim_size[0] // world_size + + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + torch.distributed.reduce_scatter_tensor(output, + input_.contiguous(), + group=group) + else: + rank = torch.distributed.get_rank(group) + input_tensor_list = list(torch.split(input_, input_split_sizes, dim=0)) + + output = torch.empty_like(input_tensor_list[rank]) + torch.distributed.reduce_scatter(output, + input_tensor_list, + group=group) + return output + + +def _reduce_scatter_along_last_dim(input_, group): + """Reduce-scatter tensors on the last dimension.""" + world_size = torch.distributed.get_world_size(group) + target_shape = list(input_.size()) + target_shape[-1] = target_shape[-1] // world_size + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split(input_, + split_size_or_sections=input_.shape[-1] // + world_size, + dim=1) + concat_tensor = torch.cat(split_tensors, dim=0) + output = _reduce_scatter_along_first_dim(concat_tensor, + group).reshape(target_shape) + return output + + +def all_gather_last_dim_from_tensor_parallel_region(input_, group): + """Wrapper for autograd function: forward: AG, backward RS """ + return _gather_along_last_dim(input_, group) + + +def reduce_scatter_to_sequence_parallel_region(input_, + group, + input_split_sizes=None): + """Wrapper for autograd function: forward: RS, backward AG """ + return _reduce_scatter_along_first_dim(input_, group, input_split_sizes) + + +def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group): + """Wrapper for autograd function: forward: RS, backward AG: AG """ + return _reduce_scatter_along_last_dim(input_, group) + + +def gather_from_sequence_parallel_region( + input_, + group, + output_split_sizes=None, +): + """Wrapper for autograd function: forward: AG, backward: RS """ + return _gather_along_first_dim(input_, group, output_split_sizes) + + +def all_to_all(group, input, output_split_sizes=None, input_split_sizes=None): + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input + + input = input.contiguous() + if output_split_sizes is None: + # Equal split (all2all) + output = torch.empty_like(input) + else: + # Unequal split (all2all-v) + output = input.new_empty( + size=[sum(output_split_sizes)] + list(input.size()[1:]), + dtype=input.dtype, + device=torch.npu.current_device(), + ) + torch.distributed.all_to_all_single( + output, + input, + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + ) + return output + + +def all_to_all_sp2hp(input_, group): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape + [num_tokens/TP, H] to [num_tokens, H/TP]. + + Args: + input_ (torch.Tensor): + The input tensor which has been distributed along the sequence + dimension. + + Returns: + torch.Tensor: The output tensor with shape [num_tokens, H/TP]. 
+ + """ + if group is None: + return input_ + world_size = torch.distributed.get_world_size(group=group) + tp_group = group + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split(input_, + split_size_or_sections=input_.shape[-1] // + world_size, + dim=1) + concat_tensor = torch.cat(split_tensors, dim=0) + output = all_to_all(tp_group, concat_tensor) + return output + + +def all_to_all_hp2sp(input_, group): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape + [num_tokens, H/TP] to [num_tokens/TP, H]. + + Args: + input_ (torch.Tensor): + The input tensor which has been distributed along the hidden + dimension. + + Returns: + torch.Tensor: The output tensor with shape [num_tokens/TP, H]. + """ + if group is None: + return input_ + world_size = torch.distributed.get_world_size(group=group) + input_ = input_.reshape(-1, input_.shape[-1]) + tp_group = group + input_exchanged = all_to_all(tp_group, input_) + input_reshaped = input_exchanged.reshape(-1, input_exchanged.shape[-1]) + split_tensors = torch.split( + input_reshaped, + split_size_or_sections=input_reshaped.shape[0] // world_size, + dim=0) + output = torch.cat(split_tensors, dim=-1) + return output diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index eb0223c7146..586b846c276 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -107,11 +107,11 @@ "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": lambda: bool(int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", '0')) ), - # MOE_ALL2ALL_BUFFER: + # VLLM_ASCEND_MOE_ALL2ALL_BUFFER: # 0: default, normal init. # 1: enable moe_all2all_buffer. - "MOE_ALL2ALL_BUFFER": - lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))), + "VLLM_ASCEND_MOE_ALL2ALL_BUFFER": + lambda: bool(int(os.getenv("VLLM_ASCEND_MOE_ALL2ALL_BUFFER", '0'))), # Some models are optimized by vllm ascend. While in some case, e.g. rlhf # training, the optimized model may not be suitable. In this case, set this # value to False to disable the optimized model. @@ -159,6 +159,11 @@ # this feature is supported in A2, and eager mode will get better performance. "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))), + # VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ: + # 0: default, normal init. + # 1: enable moe all2all seq. 
+ "VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": + lambda: bool(int(os.getenv('VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ', '0'))), } # end-env-vars-definition diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index 0b1b67a4f19..e3609f802d7 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -41,6 +41,10 @@ def register_model(): "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM") + ModelRegistry.register_model( + "Qwen3MoeForCausalLM", + "vllm_ascend.models.qwen3_dbo:CustomQwen3MoeForCausalLMDBO") + else: ModelRegistry.register_model( "DeepseekV2ForCausalLM", @@ -50,9 +54,9 @@ def register_model(): "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM") - ModelRegistry.register_model( - "Qwen3MoeForCausalLM", - "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") + ModelRegistry.register_model( + "Qwen3MoeForCausalLM", + "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") ModelRegistry.register_model( "Qwen3ForCausalLM", "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM") diff --git a/vllm_ascend/models/qwen3_dbo.py b/vllm_ascend/models/qwen3_dbo.py new file mode 100644 index 00000000000..fa87fe81f22 --- /dev/null +++ b/vllm_ascend/models/qwen3_dbo.py @@ -0,0 +1,552 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+ +# """Inference-only Qwen3 model.""" +from types import SimpleNamespace +from typing import List, Optional, Union + +import torch +import torch_npu +import vllm.model_executor.models.qwen3_moe as qwen3 +from torch import nn +from transformers import PretrainedConfig +from vllm.attention import AttentionMetadata +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, + get_tp_group) +from vllm.forward_context import get_forward_context, set_forward_context +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.models.qwen3_moe import (Qwen3MoeDecoderLayer, + Qwen3MoeForCausalLM, + Qwen3MoeModel) +from vllm.model_executor.models.utils import ( + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.sequence import IntermediateTensors + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.distributed.tensor_parallel import \ + gather_from_sequence_parallel_region +from vllm_ascend.multistream.base import MSEventKey +from vllm_ascend.multistream.context import ( + advance_step_multistream_layer_context, get_multistream_layer_context) +from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer, + MultiStreamPreTransformerLayer) +from vllm_ascend.multistream.metadata import (MultiStreamConfig, + MultiStreamStepMetadata, + make_multistream_metadata_ds) +from vllm_ascend.ops.fused_moe import (AscendSparseMoeBlock, apply_mlp, + select_experts) + +VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO + + +class Qwen3MoeDecoderLayerDBO(Qwen3MoeDecoderLayer): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super(Qwen3MoeDecoderLayerDBO, self).__init__(config, cache_config, + quant_config, prefix) + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tp_group().rank_in_group + self.tp_group = get_tp_group().device_group + self.dummy_vllm_config = SimpleNamespace( + parallel_config=SimpleNamespace(data_parallel_size=1, ), + compilation_config=SimpleNamespace(static_forward_context=None, ), + other_setting="value", + ) + self.config = config + + def forward(self, *args, **kwargs): + return super().forward(*args, **kwargs) + + # should split ops in Decoder Layer + def _forward_ms_op_input_layernorm( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + return hidden_states, residual + + def _forward_ms_op_attn( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + self.dummy_vllm_config.compilation_config.static_forward_context = ( + get_forward_context().no_compile_layers) + with set_forward_context(attn_metadata, self.dummy_vllm_config): + 
hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + if hidden_states.dtype == torch.float16: + # Fix FP16 overflow + # We scale both hidden_states and residual before + # rmsnorm, and rmsnorm result would not affect by scale. + hidden_states *= 1.0 / self.routed_scaling_factor + if self.layer_idx == 0: + # The residual is shared by all layers, we only scale it on + # first layer. + residual *= 1.0 / self.routed_scaling_factor + return hidden_states, residual + + def _forward_ms_op_post_attn_layernorm( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ): + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + return hidden_states, residual + + def _forward_op_gating( + self, + hidden_states: torch.Tensor, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + if attn_metadata is None: + attn_metadata = get_forward_context().attn_metadata + # when profile runs, force experts to load balanced tokens + # to avoid high memory consumption on a single rank. + enable_force_load_balance = get_forward_context().in_profile_run + + num_tokens, hidden_dim = hidden_states.shape + + if self.tp_size > 1: + # pass + num_tokens, hidden_size = hidden_states.shape + if num_tokens < self.tp_size: + hidden_states = nn.functional.pad( + hidden_states, (0, 0, 0, self.tp_size - num_tokens)) + chunk_hidden_states = torch.tensor_split(hidden_states, + self.tp_size, + dim=0) + chunked_hidden_states_sizes = [ + x.shape[0] for x in chunk_hidden_states + ] + local_hidden_states = chunk_hidden_states[self.tp_rank] + else: + local_hidden_states = hidden_states + chunked_hidden_states_sizes = None + + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.mlp.gate(local_hidden_states) + + # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern + mlp_config = self.config + if mlp_config.num_experts == 256: + topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( + router_logits, + k=mlp_config.num_experts_per_tok, # topk当前写8 + bias=self.mlp.gate.e_score_correction_bias, + k_group=mlp_config.topk_group, # fix: 4 + group_count=mlp_config.n_group, # fix 8 + group_select_mode=1, # 0: max in group; 1: topk2.sum(fix) + renorm=0, # 0: softmax->topk(fix); 1: topk->softmax + norm_type=1, # 0: softmax; 1: sigmoid(fix) + routed_scaling_factor=1, + eps=float(1e-20), + ) + else: + topk_weights, topk_ids = select_experts( + hidden_states=local_hidden_states, + router_logits=router_logits, + top_k=mlp_config.num_experts_per_tok, + use_grouped_topk=False, + renormalize=mlp_config.norm_topk_prob, + topk_group=getattr(mlp_config, "topk_group", None), + num_expert_group=getattr(mlp_config, "n_group", None), + custom_routing_function=None, + scoring_func=getattr(mlp_config, "scoring_func", "softmax"), + e_score_correction_bias=getattr(self.mlp.gate, + "e_score_correction_bias", + None), + ) + + topk_weights = topk_weights.to(hidden_states.dtype) + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. 
+ if enable_force_load_balance: + topk_ids = torch.randint_like(topk_ids, 0, self.config.num_experts) + + return topk_weights, topk_ids, local_hidden_states, chunked_hidden_states_sizes + + def _forward_op_grouped_mlp(self, dispatched_input, tokens_per_expert): + return apply_mlp( + dispatched_input, + self.mlp.experts.w13_weight, + self.mlp.experts.w2_weight, + tokens_per_expert, + ) + + def _forward_combine_comm(self, hidden_states, microbatch_id, num_tokens, + chunked_hidden_states_sizes): + token_dispatcher = self.mlp.experts.token_dispatchers[microbatch_id] + final_hidden_states, _ = token_dispatcher.token_unpermutation( + hidden_states) + if hasattr(self.mlp, "routed_scaling_factor"): + final_hidden_states = final_hidden_states * self.mlp.routed_scaling_factor + + if self.tp_size > 1: + final_hidden_states = gather_from_sequence_parallel_region( + final_hidden_states, self.tp_group, + chunked_hidden_states_sizes) + if num_tokens < self.tp_size: + final_hidden_states = final_hidden_states[:num_tokens] + + if hasattr(self.mlp, "shared_experts"): + final_hidden_states = ( + final_hidden_states + + token_dispatcher.cached_shared_expert_output) + token_dispatcher.cached_shared_expert_output.untyped_storage( + ).resize_(0) + token_dispatcher.cached_shared_expert_output = None + + final_hidden_states = final_hidden_states.view(num_tokens, -1) + + return final_hidden_states + + def _forward_ms_layer_alltoallv_finegrained( + self, + positions: List[torch.Tensor], + hidden_states: List[torch.Tensor], + residual: List[torch.Tensor], + attn_metadata: List[AttentionMetadata], + kv_cache: Optional[torch.Tensor] = None, + ): + layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( + ) + assert layer_index >= 0 and ms_metadata is not None + num_micro_batchs = ms_metadata.ms_config.num_micro_batches + assert len(positions) == num_micro_batchs + assert len(hidden_states) == num_micro_batchs + assert residual is not None + assert attn_metadata is not None + num_tokens = [None] * num_micro_batchs + hidden_dims = [None] * num_micro_batchs + topk_weights, topk_ids = [None] * num_micro_batchs, [ + None + ] * num_micro_batchs + tokens_per_expert = [None] * num_micro_batchs + dispatched_input = [None] * num_micro_batchs + router_expert_output = [None] * num_micro_batchs + chunked_hidden_states_sizes = [None] * num_micro_batchs + token_dispatchers = self.mlp.experts.token_dispatchers + + def discard_tensor(tensor): + if isinstance(tensor, torch.Tensor): + tensor = [tensor] + for t in tensor: + t.untyped_storage().resize_(0) + + # block 1 : attention + # block 2 : Router Gating + # block 3 : Token DisPatch + # the attn computation of microbatch 1 can be overlapped with the moe + # communication in the previous layer, and the attn computation of microbatch 2 + # can be overlapped with the attn communication of microbatch 1 + for i in range(num_micro_batchs): + forward_context = get_forward_context() + layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( + ) + ms_metadata.try_wait_event(layer_index - 1, i, + MSEventKey.FFN_AR_FINISH) + forward_context.attn_metadata = attn_metadata[i] + + # input layernorm + hidden_states[i], residual[ + i] = self._forward_ms_op_input_layernorm( + hidden_states[i], residual[i]) + # attention and tp allreduce + hidden_states[i], residual[i] = self._forward_ms_op_attn( + positions[i], hidden_states[i], residual[i], kv_cache, + attn_metadata[i]) + # post attention layer norm + hidden_states[i], residual[ + i] = 
self._forward_ms_op_post_attn_layernorm( + hidden_states[i], residual[i]) + num_tokens[i], hidden_dims[i] = hidden_states[i].shape + # If TP is enabled, hidden_states will be chunked. + ( + topk_weights[i], + topk_ids[i], + dispatched_input[i], + chunked_hidden_states_sizes[i], + ) = self._forward_op_gating(hidden_states[i], attn_metadata[i]) + token_dispatchers[i].preprocess_and_permtute1( + dispatched_input[i], + topk_weights[i], + topk_ids[i], + shared_experts=None, + shared_experts_input=None, + ) + # Launch DisPatch Comm in a New Stream. + dispatch_context = MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_BEFORE_COMM], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_AFTER_COMM], + ) + dispatch_context.before_comm_event.record() + # print_with_sync(f'begin token dispatch{i}...', torch.distributed.get_rank()) + with torch.npu.stream(dispatch_context.comm_stream): + dispatch_context.comm_stream.wait_event( + dispatch_context.before_comm_event) + token_dispatchers[i].dispatch_alltoall() + dispatched_input[i], tokens_per_expert[i] = token_dispatchers[ + i].permute2() + dispatch_context.after_comm_event.record() + + # print_with_sync('begin experts...', torch.distributed.get_rank()) + # block 4 : Router Experts Computation + # block 5 : Token Combine Communication + for i in range(num_micro_batchs): + ms_metadata.try_wait_event(layer_index, i, + MSEventKey.MOE_AFTER_COMM) + discard_tensor(hidden_states[i]) + router_expert_output[i] = self._forward_op_grouped_mlp( + dispatched_input[i], tokens_per_expert[i]) + discard_tensor(dispatched_input[i]) + + # Launch Combine Comm in a New Stream. + combine_context = MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.FFN_COM_FINISH], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.FFN_AR_FINISH], + ) + combine_context.before_comm_event.record() + ms_metadata.try_wait_event(layer_index, i, + MSEventKey.MOE_SE_COMM_FINISH) + with torch.npu.stream(combine_context.comm_stream): + combine_context.comm_stream.wait_event( + combine_context.before_comm_event) + hidden_states[i] = self._forward_combine_comm( + router_expert_output[i], + i, + num_tokens[i], + chunked_hidden_states_sizes[i], + ) + ms_metadata.ms_events[layer_index][i][ + MSEventKey. 
+ FFN_AR_FINISH] = combine_context.comm_stream.record_event( + ) + + return hidden_states, residual + + +@support_torch_compile +class CustomQwen3DBOMoEModel(Qwen3MoeModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + prefix=f"{prefix}.embed_tokens") + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Qwen3MoeDecoderLayerDBO( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size) + + # dbo related members + if VLLM_ASCEND_ENABLE_DBO: + self.use_mla = False + self.multistream_config = MultiStreamConfig() + multistream_metadata = make_multistream_metadata_ds( + start_layer=self.start_layer, + end_layer=self.end_layer, + causal_lm=getattr(config, "causal_lm", True), + multistream_config=self.multistream_config, + ) + self.ms_pre_layer = MultiStreamPreTransformerLayer( + multistream_metadata) + self.ms_post_layer = MultiStreamPostTransformerLayer( + multistream_metadata) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + num_normal_layers = (0 if VLLM_ASCEND_ENABLE_DBO and self.can_run_ms() + else self.end_layer - self.start_layer) + + moe_start_layer = self.start_layer + num_normal_layers + for i in range(self.start_layer, min(moe_start_layer, self.end_layer)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual) + + if moe_start_layer < self.end_layer: + # if we enable multistream/dbo, process sparse layers here + hidden_states, residual = self._forward_ms_layers( + positions=positions, + hidden_states=hidden_states, + residual=residual, + moe_start_layer=moe_start_layer, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def can_run_ms(self): + attn_metadata = get_forward_context().attn_metadata + # enable prefill overlap + with_prefill = get_forward_context().with_prefill + if (attn_metadata is None or not with_prefill + or not attn_metadata.enable_dbo_across_dp): + return False + + return True + + def _forward_ms_layers( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor, + moe_start_layer: int, + kv_caches: Optional[List[torch.Tensor]] = None, + ): + + if moe_start_layer == self.end_layer: + return hidden_states, residual + + 
attn_metadata, [positions, hidden_states, + residual] = self.ms_pre_layer( + [positions, hidden_states, residual], ) + num_micro_batch = len(attn_metadata) + # the rest layers + for i in range(moe_start_layer, self.end_layer): + layer = self.layers[i] + ms_layer_forward_func = layer._forward_ms_layer_alltoallv_finegrained + # print("get_called......") + hidden_states, residual = ms_layer_forward_func( + positions=positions, + hidden_states=hidden_states, + residual=residual, + attn_metadata=attn_metadata, + ) + advance_step_multistream_layer_context() + + layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( + ) + for i in range(num_micro_batch): + ms_metadata.try_wait_event(layer_index - 1, i, + MSEventKey.FFN_AR_FINISH) + + [hidden_states, + residual] = self.ms_post_layer([hidden_states, residual], ) + return hidden_states, residual + + +class CustomQwen3MoeForCausalLMDBO(Qwen3MoeForCausalLM): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], + } + qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = CustomQwen3DBOMoEModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def forward(self, *args, **kwargs): + if "graph_enable" in kwargs: + kwargs.pop("graph_enable") + return super().forward(*args, **kwargs) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 8ff1b52a7ad..485e5ca92fc 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -16,8 +16,11 @@ # Adapted from vllm/model_executor/models/qwen3_moe.py # This file is a part of the vllm-ascend project. 
+import vllm.model_executor.models.qwen3_moe as qwen3 from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM +from vllm_ascend.ops.fused_moe import AscendSparseMoeBlock + class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): packed_modules_mapping = { @@ -33,3 +36,4 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], } + qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock diff --git a/vllm_ascend/multistream/ms_split.py b/vllm_ascend/multistream/ms_split.py index 3af6337e473..684f6aea136 100644 --- a/vllm_ascend/multistream/ms_split.py +++ b/vllm_ascend/multistream/ms_split.py @@ -4,7 +4,8 @@ import numpy as np import torch -from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.attention_v1 import (AscendAttentionState, + AscendMetadata) from .base import MSAttentionMetadataSplitConfig @@ -241,3 +242,115 @@ def model_input_split_v1_mla_attn( decode=decode_post, ) return [attention_metadata_pre, attention_metadata_post] + + +def model_input_split_v1_attn( + attn_metadata: AscendMetadata, + _metadata_cls, + ms_split_config: MSAttentionMetadataSplitConfig, +) -> List[Any]: + assert 0 < ms_split_config.num_micro_batches < 3 + if attn_metadata is None: + return [attn_metadata] + [token_index, + seq_index] = compute_split_seq_index(attn_metadata.query_lens, + attn_metadata.attn_state, + attn_metadata.num_actual_tokens) + if token_index == 0 or seq_index == 0 or seq_index == len( + attn_metadata.query_lens): + return [attn_metadata] + + # split attn metadata + + [block_table_pre, + block_table_post] = split_attn_tensor_type(attn_metadata.block_tables, + seq_index) + + query_start_loc_pre = query_start_loc_post = None + if attn_metadata.query_start_loc is not None: + query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] + query_start_loc_post = deepcopy( + attn_metadata.query_start_loc[seq_index:] + ) - attn_metadata.query_start_loc[seq_index] + + [query_lens_pre, + query_lens_post] = split_attn_tensor_type(attn_metadata.query_lens, + seq_index) + [seq_lens_pre, + seq_lens_post] = split_attn_tensor_type(attn_metadata.seq_lens, seq_index) + + max_query_len_pre = max_query_len_post = None + if attn_metadata.max_query_len is not None: + max_query_len_pre, max_query_len_post = max(query_lens_pre), max( + query_lens_post) + + [slot_mapping_pre, + slot_mapping_post] = split_attn_tensor_type(attn_metadata.slot_mapping, + token_index) + + is_only_prefill_pre = is_only_prefill_post = attn_metadata.is_only_prefill + has_prefill_pre, _ = torch.any(query_lens_pre > 1).item(), torch.any( + query_lens_post > 1).item() + + if not attn_metadata.is_only_prefill: + is_only_prefill_post = torch.all(query_lens_post > 1).item() + + if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: + # the attn_mla kernel in torch npu only accept 128*128 attn mask + attn_mask_pre = attn_mask_post = attn_metadata.attn_mask + attn_state_pre = attn_state_post = attn_metadata.attn_state + elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: + # should be none in decode only state + attn_mask_pre = attn_mask_post = attn_metadata.attn_mask + attn_state_pre = attn_state_post = AscendAttentionState.DecodeOnly # type: ignore + else: + # chunked prefill + assert attn_metadata.attn_mask is not None + if has_prefill_pre: + attn_state_pre = attn_state_post = AscendAttentionState.ChunkedPrefill # 
type: ignore + attn_mask_pre = attn_metadata.attn_mask[:token_index, :max( + seq_lens_pre)].contiguous() + attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore + attn_mask_post = attn_metadata.attn_mask[ + token_index:, :max(seq_lens_post)].contiguous() + else: + attn_state_pre = AscendAttentionState.DecodeOnly # type: ignore + attn_mask_pre = None + attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore + attn_mask_post = attn_metadata.attn_mask[ + token_index:, :max(seq_lens_post)].contiguous() + + # construct metadata + attention_metadata_pre = _metadata_cls( + num_actual_tokens=token_index, + block_tables=block_table_pre, + query_start_loc=query_start_loc_pre, + query_lens=query_lens_pre, + seq_lens=seq_lens_pre, + seq_lens_list=seq_lens_pre.tolist(), + max_query_len=max_query_len_pre, + slot_mapping=slot_mapping_pre, + is_only_prefill=is_only_prefill_pre, + attn_state=attn_state_pre, + attn_mask=attn_mask_pre, + num_input_tokens=token_index, + enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, + ) + + attention_metadata_post = _metadata_cls( + num_actual_tokens=attn_metadata.num_actual_tokens - token_index, + block_tables=block_table_post, + query_start_loc=query_start_loc_post, + query_lens=query_lens_post, + seq_lens=seq_lens_post, + seq_lens_list=seq_lens_post.tolist(), + max_query_len=max_query_len_post, + slot_mapping=slot_mapping_post, + is_only_prefill=is_only_prefill_post, + attn_state=attn_state_post, + attn_mask=attn_mask_post, + num_input_tokens=attn_metadata.num_input_tokens - token_index, + enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, + ) + + return [attention_metadata_pre, attention_metadata_post] diff --git a/vllm_ascend/ops/comm_utils.py b/vllm_ascend/ops/comm_utils.py new file mode 100644 index 00000000000..6c43773308b --- /dev/null +++ b/vllm_ascend/ops/comm_utils.py @@ -0,0 +1,127 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
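Editor's note on the comm_utils.py helpers that follow: they launch collectives with async_op=True and return the input, the freshly allocated output buffer, and the communication handle, leaving it to the caller to decide when to synchronize. A usage sketch for async_all_to_all, assuming an initialized expert-parallel group; permuted_tokens, recv_split_sizes, send_split_sizes and ep_group are placeholder names for whatever the dispatcher passes in.

# Illustrative only: assumes torch.distributed is initialized on NPU and
# `ep_group` is the expert-parallel process group used by the dispatcher.
_, dispatched_tokens, handle = async_all_to_all(
    permuted_tokens,       # tokens already permuted into per-expert order
    recv_split_sizes,      # how many tokens to receive from each rank
    send_split_sizes,      # how many tokens to send to each rank
    ep_group,
)
# ... independent compute (e.g. a shared-expert MLP) can overlap here ...
handle.wait()              # synchronize only when dispatched_tokens is needed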
+import torch +import torch.distributed +import torch.distributed as dist +import torch_npu + +COMM_STREAM = None + + +def async_all_gather(input_, + group, + event=None, + is_use_get_global_memory_buffer=False): + world_size = torch.distributed.get_world_size(group) + dim_size = list(input_.size()) + new_dim_size = dim_size[0] * world_size + dim_size[0] = new_dim_size + + ag_out = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + if event: + # multi stream wait event + global COMM_STREAM + if COMM_STREAM is None: + COMM_STREAM = torch_npu.npu.Stream( + device=torch.npu.current_device()) + with torch_npu.npu.stream(COMM_STREAM): + event.wait() + handle = torch.distributed._all_gather_base(ag_out, + input_.contiguous(), + group=group, + async_op=True) + else: + handle = torch.distributed._all_gather_base(ag_out, + input_.contiguous(), + group=group, + async_op=True) + return input_, ag_out, handle + + +def async_reduce_scatter(input_, + group, + event=None, + stream=None, + is_use_get_global_memory_buffer=False): + world_size = dist.get_world_size(group) + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] // world_size + + rs_out = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + if event or stream: + # multi stream wait event + global COMM_STREAM + if COMM_STREAM is None: + COMM_STREAM = torch_npu.npu.Stream( + device=torch.npu.current_device()) + with torch_npu.npu.stream(COMM_STREAM): + if event: + event.wait() + if stream: + torch.npu.current_stream().wait_stream(stream) + handle = torch.distributed.reduce_scatter_tensor( + rs_out, input_.contiguous(), group=group, async_op=True) + else: + handle = torch.distributed.reduce_scatter_tensor(rs_out, + input_.contiguous(), + group=group, + async_op=True) + return input_, rs_out, handle + + +def async_all_to_all(input_, + output_split_sizes, + input_split_sizes, + group, + event=None): + if output_split_sizes is None: + # Equal split (all2all) + a2a_out = torch.empty_like(input_) + else: + # Unequal split (all2all-v) + a2a_out = input_.new_empty( + size=[sum(output_split_sizes)] + list(input_.size()[1:]), + dtype=input_.dtype, + device=torch.npu.current_device(), + ) + + if event: + # multi stream wait event + global COMM_STREAM + if COMM_STREAM is None: + COMM_STREAM = torch_npu.npu.Stream( + device=torch.npu.current_device()) + with torch_npu.npu.stream(COMM_STREAM): + event.wait() + handle = dist.all_to_all_single( + a2a_out, + input_.contiguous(), + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + async_op=True) + else: + handle = dist.all_to_all_single(a2a_out, + input_.contiguous(), + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + async_op=True) + return input_, a2a_out, handle diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 61205ffea2e..9b9e1a9fc7d 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -51,7 +51,7 @@ get_ascend_soc_version, get_rm_router_logits_state, is_310p) -MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER +VLLM_ASCEND_MOE_ALL2ALL_BUFFER: bool = envs_ascend.VLLM_ASCEND_MOE_ALL2ALL_BUFFER SELECT_GATING_TOPK_SOTFMAX_EXPERTS: bool = envs_ascend.SELECT_GATING_TOPK_SOTFMAX_EXPERTS @@ -274,11 +274,13 @@ def fused_experts_with_mc2( return hidden_states, shared_hidden_states -def apply_mlp(hidden_states_wrapper: List[torch.Tensor], - w1: torch.Tensor, - w2: torch.Tensor, - group_list: 
torch.Tensor, - group_list_type: int = 1) -> torch.Tensor: +def apply_mlp( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + group_list: torch.Tensor, + group_list_type: int = 1, +) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -300,9 +302,6 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor], hidden_states: output hidden states after MLP. """ - assert len(hidden_states_wrapper) == 1 - hidden_states = hidden_states_wrapper.pop() - w1 = w1.transpose(1, 2) hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], @@ -330,6 +329,8 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor], return hidden_states +# currently expert parallelism implemented with all2all +# is under-optimized. def fused_experts_with_all2all( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -544,10 +545,7 @@ def fused_experts_with_all2all_buffer( hidden_states = hidden_states[sorted_idx] group_list_type = 0 - hidden_states_wrapper = [hidden_states] - del hidden_states - - hidden_states = apply_mlp(hidden_states_wrapper, + hidden_states = apply_mlp(hidden_states, w1, w2, expert_tokens, @@ -683,6 +681,24 @@ def fused_experts_moge( return final_hidden_states +def fused_experts_with_all2allv( + token_dispatcher, + probs, + routing_map, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, +): + # Enable moe alltoallv, it's a balanced policy for precision and efficiency. + (share_experts_output, dispatched_input, + tokens_per_expert) = (token_dispatcher.token_permutation( + hidden_states, probs, routing_map)) + + expert_output = apply_mlp(dispatched_input, w1, w2, tokens_per_expert) + output, mlp_bias = token_dispatcher.token_unpermutation(expert_output) + return output + + def fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -1133,7 +1149,7 @@ def apply( topk_ids=topk_ids, top_k=top_k, expert_map=expert_map) - elif MOE_ALL2ALL_BUFFER: + elif VLLM_ASCEND_MOE_ALL2ALL_BUFFER: return fused_experts_with_all2all_buffer( hidden_states=x, w1=layer.w13_weight, @@ -1145,6 +1161,16 @@ def apply( global_batch_size=self.global_batch_size, expert_map=expert_map, ep_group=get_ep_group()) + elif fused_moe_state == FusedMoEState.All2AllSeq: + token_dispatcher = kwargs.get("token_dispatcher") + return fused_experts_with_all2allv( + token_dispatcher=token_dispatcher, + probs=topk_weights, + routing_map=topk_ids, + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + ) else: return fused_experts_with_all2all(hidden_states=x, w1=layer.w13_weight, @@ -1295,6 +1321,25 @@ def __init__( # NOTE: self.tp_group is not expert_tp_group self.tp_group = get_tp_group().device_group self.quant_method.create_weights(layer=self, **moe_quant_params) + self.token_dispatcher = None + if envs_ascend.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ and isinstance( + self.quant_method, AscendUnquantizedFusedMoEMethod): + self.reduce_results = False + moe_dispatcher_config = ( + MoEDispatcherConfig().set_num_moe_experts( + self.global_num_experts).set_num_local_experts( + self.local_num_experts).set_moe_router_topk( + top_k).set_group_topk(topk_group). 
+ set_num_groups(num_expert_group).set_expert_bias( + e_score_correction_bias).set_scaling_factor(1.0).build()) + self.token_dispatcher = MoEAlltoAllSeqOverLapDispatcher( + moe_dispatcher_config) + if envs_ascend.VLLM_ASCEND_ENABLE_DBO: + token_dispatcher1 = MoEAlltoAllSeqOverLapDispatcher( + moe_dispatcher_config) + self.token_dispatchers = [ + self.token_dispatcher, token_dispatcher1 + ] def naive_multicast(self, x: torch.Tensor, cu_tokens_across_dp_cpu: torch.Tensor): @@ -1419,6 +1464,7 @@ def forward(self, shared_experts=shared_experts if self.torchair_graph_enabled and self.enable_multistream_moe and not is_prefill else None, mc2_mask=mc2_mask, + token_dispatcher=self.token_dispatcher, ) if shared_experts: @@ -1492,6 +1538,83 @@ def _forward_ms_fused_moe_comp( scoring_func=self.scoring_func, e_score_correction_bias=self.e_score_correction_bias, is_prefill=is_prefill, - enable_force_load_balance=enable_force_load_balance) + enable_force_load_balance=enable_force_load_balance, + ) + + return hidden_states + + +class AscendSparseMoeBlock(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + if self.tp_size > config.num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.num_experts}.") + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_moe = ( + ascend_config.torchair_graph_config.enable_multistream_moe) + + self.gate = ReplicatedLinear( + config.hidden_size, + config.num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", + ) + + self.experts = AscendFusedMoE( + num_experts=config.num_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts", + ) + + self.top_k = config.num_experts_per_tok + + self.dp_size = get_dp_group().world_size + + self.tp_group = get_tp_group().device_group + self.tp_rank = get_tp_group().rank_in_group + self.ep_group = get_ep_group() + + self.params_dtype = torch.get_default_dtype() + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + if attn_metadata is None: + attn_metadata = get_forward_context().attn_metadata + # when profile runs, force experts to load balanced tokens + # to avoid high memory consumption on a single rank. 
+ enable_force_load_balance = get_forward_context().in_profile_run + is_prefill = get_forward_context().with_prefill + + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + + hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + is_prefill=is_prefill, + top_k=self.top_k, + enable_force_load_balance=enable_force_load_balance, + shared_experts=None, + ) return hidden_states diff --git a/vllm_ascend/ops/moe_dispatcher/__init__.py b/vllm_ascend/ops/moe_dispatcher/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py new file mode 100644 index 00000000000..91118e296de --- /dev/null +++ b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py @@ -0,0 +1,578 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2024; NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
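Editor's note before token_dispatcher.py: the module starts with MoEDispatcherConfig, a small builder whose setters return self and whose build() hands back the configured object; AscendFusedMoE constructs it exactly this way in fused_moe.py above, and the unit test mirrors it with mocked groups. A condensed sketch with illustrative values; constructing the dispatcher itself assumes an initialized expert-parallel group (get_ep_group) and an NPU device.

# Values are illustrative; in fused_moe.py they come from the model config
# (global/local expert counts, top_k, group routing, e_score_correction_bias).
config = (MoEDispatcherConfig()
          .set_num_moe_experts(128)      # total experts across EP ranks
          .set_num_local_experts(16)     # experts owned by this rank
          .set_moe_router_topk(8)
          .set_group_topk(4)
          .set_num_groups(8)
          .set_expert_bias(None)
          .set_scaling_factor(1.0)
          .build())
dispatcher = MoEAlltoAllSeqOverLapDispatcher(config)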
+from typing import Optional + +import torch +import torch_npu +from vllm.distributed.parallel_state import get_ep_group + +from vllm_ascend.distributed.tensor_parallel import ( + all_gather_last_dim_from_tensor_parallel_region, all_to_all_hp2sp, + all_to_all_sp2hp, gather_from_sequence_parallel_region, + reduce_scatter_last_dim_to_tensor_parallel_region) +from vllm_ascend.ops.comm_utils import async_all_to_all + + +class MoEDispatcherConfig: + + def __init__(self): + self.num_local_experts: int = 0 + self.num_moe_experts: int = 0 + self.moe_pad_expert_input_to_capacity: bool = False + self.moe_expert_capacity_factor: Optional[float] = None + self.moe_router_topk: int = 2 + self.moe_grouped_gemm: bool = False + self.group_topk: int = 0 + self.num_groups: int = 1 + self.expert_bias: torch.Tensor = None + self.scaling_factor: Optional[float] = None + self.is_fused: bool = True + + def set_num_local_experts(self, num_local_experts): + self.num_local_experts = num_local_experts + return self + + def set_num_moe_experts(self, num_moe_experts): + self.num_moe_experts = num_moe_experts + return self + + def set_moe_pad_expert_input_to_capacity(self, + moe_pad_expert_input_to_capacity): + self.moe_pad_expert_input_to_capacity = moe_pad_expert_input_to_capacity + return self + + def set_moe_expert_capacity_factor(self, moe_expert_capacity_factor): + self.moe_expert_capacity_factor = moe_expert_capacity_factor + return self + + def set_moe_router_topk(self, moe_router_topk): + self.moe_router_topk = moe_router_topk + return self + + def set_moe_grouped_gemm(self, moe_grouped_gemm): + self.moe_grouped_gemm = moe_grouped_gemm + return self + + def set_group_topk(self, group_topk): + self.group_topk = group_topk + return self + + def set_num_groups(self, num_groups): + self.num_groups = num_groups + return self + + def set_expert_bias(self, expert_bias): + self.expert_bias = expert_bias + return self + + def set_scaling_factor(self, scaling_factor): + self.scaling_factor = scaling_factor + return self + + def set_is_fused(self, is_fused): + self.is_fused = is_fused + return self + + def build(self): + return self + + +class MoEDispatcher: + + def __init__(self, config: MoEDispatcherConfig) -> None: + """ + Initialize the MoE Token Dispatcher. + """ + self.config = config + self.shared_experts = None + + def set_shared_experts(self, shared_experts): + self.shared_experts = shared_experts + + @property + def ep_group(self): + """Get expert model parallel group.""" + return get_ep_group().device_group + + @property + def ep_rank(self): + return get_ep_group().rank_in_group + + @property + def ep_size(self): + return get_ep_group().world_size + + @property + def tp_ep_group(self): + """Get expert tensor and model parallel group.""" + return None + + @property + def tp_ep_size(self): + return 1 + + +class MoEAlltoAllSeqOverLapDispatcher(MoEDispatcher): + overlap_stream = None + """ + The implementation of the AlltoAll-based token dispatcher, which handles token + dispatching on the sequence level instead of token level. The core of this implementation + lies in each device dispatching on the entire sequence, with the hidden state being partitioned. + + """ + + def __init__(self, config: MoEDispatcherConfig): + """ + Initialize the AlltoAllSeq token dispatcher. + + Args: + config (MoEDispatcherConfig): Configuration for the transformer model. 
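+
+        Note: all dispatcher instances share a single class-level
+        ``overlap_stream``; ``preprocess_and_permtute1`` runs the first token
+        permutation (and the optional shared-expert forward) on that stream so
+        it can overlap with work on the default stream.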
+ """ + super().__init__(config) + self.num_local_experts = config.num_local_experts + self.config = config + # use MOEAlltoAllSEQTokenDispatcher to init + + self.hidden_shape = None + self.num_input_tokens = None + self.num_experts = config.num_moe_experts + assert self.num_local_experts > 0, "Expected at least one expert" + if self.num_local_experts > 1: + self.expert_ids_per_ep_rank = torch.tensor( + [i % self.num_local_experts for i in range(self.num_experts)], + dtype=torch.int32, + device=torch.npu.current_device(), + ) + + local_expert_indices_offset = (self.ep_rank * self.num_local_experts) + + self.local_expert_indices = [ + local_expert_indices_offset + i + for i in range(self.num_local_experts) + ] + assert (len(self.local_expert_indices) == self.num_local_experts + ), "Invalid local expert indices" + for i in range(len(self.local_expert_indices) - 1): + assert (self.local_expert_indices[i] == + self.local_expert_indices[i + 1] - + 1), "local_expert_indices must be continuous" + self.probs = None + self.input_splits = None + self.output_splits = None + self.routing_map = None + self.hidden_shape_before_permute = None + + # [tp_ep_size * ep_size, num_local_experts]. Represents the number of tokens sent + # to each local expert by all ranks. + self.num_global_tokens_per_local_expert_cpu = None + self.num_global_tokens_per_local_expert = None + + # A cuda stream synchronization is needed in self.token_permutation() + # in some cases, because there are several non-blocking DtoH data + # transfers called in self.preprocess(). The synchronization happens + # at different points based on MoE settings as late as possible. + # Valid sync points are "before_permutation_1", "before_ep_alltoall", + # "before_finish", and "no_sync". + self.device_sync_point = "no_sync" + + # cached intermediate tensors. + self.cached_permutated_local_input_tokens = None + self.cached_global_input_tokens = None + self.cached_shared_expert_output = None + self.tokens_per_expert = None + self.perm1_finish_event = None + self.global_input_tokens_local_experts_indices = None + + if MoEAlltoAllSeqOverLapDispatcher.overlap_stream is None: + MoEAlltoAllSeqOverLapDispatcher.overlap_stream = torch.npu.Stream() + + self.overlap_stream = MoEAlltoAllSeqOverLapDispatcher.overlap_stream + + def preprocess(self, + indices: torch.Tensor, + with_sync=True) -> torch.Tensor: + """ + Preprocess routing map for AlltoAll communication and token permutation. + This method computes the number of tokens assigned to each expert based on + the routing map. It also initializes the necessary data structures for + AlltoAll communication, such as input and output splits, and the mapping + between global tokens and local experts. + + Args: + routing_map (torch.Tensor): The mapping of tokens to experts, with shape + [num_tokens, num_experts]. + + Returns: + torch.Tensor: Tensor containing the number of tokens assigned to local expert. + """ + num_local_tokens_per_expert = torch.histc(indices, + bins=self.num_experts, + min=0, + max=self.num_experts) + + # num_local_tokens_per_expert: [num_experts] + + ep_size = self.ep_size + + # Dropless + self.num_out_tokens = indices.numel() + if self.ep_size > 1 or self.num_local_experts > 1: + # Token dropless and enable ep. A synchronization is needed before expert parallel + # AlltoAll communication to get the `input_splits` and `output_splits` CPU values. + self.device_sync_point = "before_ep_alltoall" + else: + # Token dropless and no ep. 
A synchronization is needed to get the + # `tokens_per_expert` CPU value. + self.device_sync_point = "before_finish" + + if ep_size > 1: + # =================================================== + # Calculate input_splits, output_splits for alltoall-v. + # =================================================== + self.input_splits = (num_local_tokens_per_expert.reshape( + ep_size, self.num_local_experts).sum(axis=1).to( + torch.device("cpu"), non_blocking=True).numpy()) + num_global_tokens_per_expert = gather_from_sequence_parallel_region( + num_local_tokens_per_expert, + group=self.ep_group).reshape(ep_size, self.num_experts) + self.num_global_tokens_per_local_expert = num_global_tokens_per_expert[:, self.local_expert_indices[ + 0]:self.local_expert_indices[-1] + 1] + if self.num_global_tokens_per_local_expert is None: + raise ValueError( + "num_global_tokens_per_local_expert must be set before sum." + ) + self.output_splits = (self.num_global_tokens_per_local_expert.sum( + axis=-1).to(torch.device("cpu"), non_blocking=True).numpy()) + num_tokens_per_local_expert = self.num_global_tokens_per_local_expert.sum( + axis=0) + # =================================================== + # num_global_tokens_per_expert: [ep_size, num_experts] + # num_global_tokens_per_local_expert: [ep_size, num_local_experts] + # num_tokens_per_local_expert: [num_local_experts] + # =================================================== + else: + self.num_global_tokens_per_local_expert = num_local_tokens_per_expert.reshape( + -1, self.num_experts) + num_tokens_per_local_expert = num_local_tokens_per_expert + + if self.num_local_experts > 1 and with_sync: + if self.num_global_tokens_per_local_expert is None: + raise ValueError( + "num_global_tokens_per_local_expert must be set before operations." + ) + self.device_sync_point = "no_sync" + self.global_input_tokens_local_experts_indices = torch.repeat_interleave( + self.expert_ids_per_ep_rank, + self.num_global_tokens_per_local_expert.ravel()) + + return num_tokens_per_local_expert + + def token_permutation( + self, + hidden_states: torch.Tensor, + probs: torch.Tensor, + routing_map: torch.Tensor, + ): + """ + Dispatch tokens to local experts using AlltoAllSeq communication. + + Args: + hidden_states (torch.Tensor): Input token embeddings. + probs (torch.Tensor): Probs of tokens assigned to experts. + Shape: [num_tokens, num_experts]. + routing_map (torch.Tensor): Mapping of tokens assigned to experts. + Shape: [num_tokens, num_experts]. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + - Permuted token embeddings for local experts. + - Number of tokens per expert. 
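+                - The shared-expert output is returned as well, as the first
+                  element of the tuple (``None`` when no shared experts are
+                  set).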
+ """ + self.hidden_shape = hidden_states.shape + self.probs = probs + self.top_indices = routing_map + assert probs.dim() == 2, "Expected 2D tensor for probs" + assert routing_map.dim() == 2, "Expected 2D tensor for routing map" + + # Permutation 1: input to AlltoAll input + def alltoall_token_permutation1(hidden_states, routing_map): + assert self.hidden_shape is not None + hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) + tokens_per_expert = self.preprocess(routing_map) + if self.tp_ep_size > 1: + hidden_states = all_to_all_sp2hp(hidden_states, + group=self.tp_ep_group) + self.hidden_shape_before_permute = hidden_states.shape + + if self.device_sync_point == "before_permutation_1": + torch.npu.current_stream().synchronize() + + permutated_local_input_tokens, reversed_local_input_permutation_mapping = torch_npu.npu_moe_token_permute( + tokens=hidden_states, + indices=self.top_indices, + num_out_tokens=self.num_out_tokens, + ) + return permutated_local_input_tokens, reversed_local_input_permutation_mapping, tokens_per_expert + + permutated_local_input_tokens, reversed_local_input_permutation_mapping, tokens_per_expert = alltoall_token_permutation1( + hidden_states, routing_map) + self.reversed_local_input_permutation_mapping = reversed_local_input_permutation_mapping + # permute 1 + + ep_group = self.ep_group + + # Perform expert parallel AlltoAll communication + if self.device_sync_point == "before_ep_alltoall": + torch.npu.current_stream().synchronize() + _, global_input_tokens, permute1_ep_all_to_all_handle = async_all_to_all( + permutated_local_input_tokens, + self.output_splits, + self.input_splits, + ep_group, + ) + + # shared experts compute + if self.shared_experts is not None: + (share_experts_output), *_ = self.shared_experts(hidden_states) + else: + share_experts_output = None + + permute1_ep_all_to_all_handle.wait() + permutated_local_input_tokens.untyped_storage().resize_(0) + + def alltoall_token_permutation2(global_input_tokens): + # Permutation 2: Sort tokens by local expert. + if self.num_local_experts > 1: + global_input_tokens, self.reversed_global_input_permutation_mapping = torch_npu.npu_moe_token_permute( + global_input_tokens, + self.global_input_tokens_local_experts_indices) + + # Perform tensor parallel AllGather on the hidden dimension to obtain the input tokens. 
+            # global_input_tokens: [SEQL, H/TP] -> [SEQL, H]
+            if self.tp_ep_size > 1 and self.config.moe_grouped_gemm:
+                global_input_tokens = all_gather_last_dim_from_tensor_parallel_region(
+                    global_input_tokens, self.tp_ep_group)
+            if self.device_sync_point == "before_finish":
+                torch.npu.current_stream().synchronize()
+
+            return global_input_tokens
+
+        # Permutation 2 of the AlltoAll output tokens.
+        global_input_tokens = alltoall_token_permutation2(global_input_tokens)
+
+        return share_experts_output, global_input_tokens, tokens_per_expert
+
+    def preprocess_and_permtute1(self,
+                                 hidden_states: torch.Tensor,
+                                 probs: torch.Tensor,
+                                 routing_map: torch.Tensor,
+                                 shared_experts=None,
+                                 shared_experts_input: torch.Tensor = None):
+        self.hidden_shape = hidden_states.shape
+        self.probs = probs
+        self.top_indices = routing_map
+        assert probs.dim() == 2, "Expected 2D tensor for probs"
+        assert routing_map.dim() == 2, "Expected 2D tensor for routing map"
+        assert self.hidden_shape is not None
+
+        hidden_states = hidden_states.view(-1, self.hidden_shape[-1])
+        tokens_per_expert = self.preprocess(routing_map, with_sync=False)
+        self.hidden_shape_before_permute = hidden_states.shape
+
+        if self.device_sync_point == "before_permutation_1":
+            torch.npu.current_stream().synchronize()
+
+        event = torch.npu.current_stream().record_event()
+        self.perm1_finish_event = torch.npu.Event()
+        with torch.npu.stream(self.overlap_stream):
+            assert self.overlap_stream is not None
+            self.overlap_stream.wait_event(event)
+
+            if shared_experts is not None:
+                shared_output = shared_experts(shared_experts_input)
+                self.cached_shared_expert_output = shared_output
+
+            hidden_states, self.reversed_local_input_permutation_mapping = torch_npu.npu_moe_token_permute(
+                tokens=hidden_states,
+                indices=self.top_indices,
+                num_out_tokens=self.num_out_tokens,
+            )
+
+            self.perm1_finish_event.record()
+
+        # repeat_interleave will launch a sync on the current stream.
+        if self.num_local_experts > 1:
+            self.device_sync_point = "no_sync"
+            if self.num_global_tokens_per_local_expert is None:
+                raise ValueError(
+                    "num_global_tokens_per_local_expert must be set before operations."
+                )
+            self.global_input_tokens_local_experts_indices = torch.repeat_interleave(
+                self.expert_ids_per_ep_rank,
+                self.num_global_tokens_per_local_expert.ravel())
+
+        self.cached_permutated_local_input_tokens = hidden_states
+        self.tokens_per_expert = tokens_per_expert
+
+    def dispatch_alltoall(self):
+        ep_group = self.ep_group
+
+        # Perform expert parallel AlltoAll communication
+        if self.device_sync_point == "before_ep_alltoall":
+            torch.npu.current_stream().synchronize()
+
+        torch.npu.current_stream().wait_event(self.perm1_finish_event)
+        self.perm1_finish_event = None
+        _, self.cached_global_input_tokens, permute1_ep_all_to_all_handle = async_all_to_all(
+            self.cached_permutated_local_input_tokens,
+            self.output_splits,
+            self.input_splits,
+            ep_group,
+        )
+        permute1_ep_all_to_all_handle.wait()
+        if self.cached_permutated_local_input_tokens is None:
+            raise ValueError(
+                "cached_permutated_local_input_tokens must be set before operations."
+ ) + self.cached_permutated_local_input_tokens.untyped_storage().resize_(0) + self.cached_permutated_local_input_tokens = None + + def permute2(self): + global_input_tokens = self.cached_global_input_tokens + if self.num_local_experts > 1: + global_input_tokens, self.reversed_global_input_permutation_mapping = torch_npu.npu_moe_token_permute( + self.cached_global_input_tokens, + self.global_input_tokens_local_experts_indices) + assert self.cached_global_input_tokens is not None + self.cached_global_input_tokens.untyped_storage().resize_(0) + self.cached_global_input_tokens = None + + return global_input_tokens, self.tokens_per_expert + + def unpermute1(self, hidden_states: torch.Tensor): + # Unpermutation 2: expert output to AlltoAll input + if hidden_states.shape[0] > 0 and self.num_local_experts > 1: + hidden_states = torch_npu.npu_moe_token_unpermute( + hidden_states, self.reversed_global_input_permutation_mapping) + self.cached_global_output_tokens = hidden_states + self.reversed_global_input_permutation_mapping = None + + def combine_alltoall(self): + ep_group = self.ep_group + # Perform expert parallel AlltoAll communication + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + _, self.cached_local_output_tokens, handle = async_all_to_all( + self.cached_global_output_tokens, self.input_splits, + self.output_splits, ep_group) + handle.wait() + self.cached_global_output_tokens.untyped_storage().resize_(0) + self.cached_global_output_tokens = None + self.input_splits = None + self.output_splits = None + + def unpermute2(self): + output = torch_npu.npu_moe_token_unpermute( + permuted_tokens=self.cached_local_output_tokens, + sorted_indices=self.reversed_local_input_permutation_mapping.to( + torch.int32), + probs=self.probs, + restore_shape=self.hidden_shape_before_permute) + + output = output.view(self.hidden_shape) + + self.probs = None + self.reversed_local_input_permutation_mapping = None + self.cached_local_output_tokens.untyped_storage().resize_(0) + self.cached_local_output_tokens = None + + return output + + def token_unpermutation(self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None): + """ + Reverse the token permutation to restore the original order. + + Args: + hidden_states (torch.Tensor): Output from local experts. + bias (torch.Tensor, optional): Bias tensor (not supported). + + Returns: + Tuple[torch.Tensor, Optional[torch.Tensor]]: + - Unpermuted token embeddings in the original order. + - None (bias is not supported). 
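+
+        Note: the cached ``input_splits`` / ``output_splits`` from the matching
+        ``token_permutation`` call are cleared at the end of this method, so a
+        new dispatch must recompute them.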
+ """ + + def alltoall_token_unpermutation1(hidden_states): + assert bias is None, "Bias is not supported in MoEAlltoAllSeqTokenDispatcher" + # Perform tensor parallel Reduce-Scatter + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + if self.tp_ep_size > 1: + hidden_states = reduce_scatter_last_dim_to_tensor_parallel_region( + hidden_states, group=self.tp_ep_group) + + # Unpermutation 2: expert output to AlltoAll input + if hidden_states.shape[0] > 0 and self.num_local_experts > 1: + hidden_states = torch_npu.npu_moe_token_unpermute( + hidden_states, + self.reversed_global_input_permutation_mapping) + + return hidden_states + + hidden_states = alltoall_token_unpermutation1(hidden_states) + + ep_group = self.ep_group + # Perform expert parallel AlltoAll communication + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + _, permutated_local_input_tokens, handle = async_all_to_all( + hidden_states, self.input_splits, self.output_splits, ep_group) + handle.wait() + hidden_states.untyped_storage().resize_(0) + + def alltoall_token_unpermutation2(permutated_local_input_tokens): + # Unpermutation 1: AlltoAll output to output + + output = torch_npu.npu_moe_token_unpermute( + permuted_tokens=permutated_local_input_tokens, + sorted_indices=self.reversed_local_input_permutation_mapping. + to(torch.int32), + probs=self.probs, + restore_shape=self.hidden_shape_before_permute) + + # Perform tensor parallel AlltoAll communication + # output: [S*B, H/TP] -> [S*B/TP, H] + if self.tp_ep_size > 1: + output = all_to_all_hp2sp(output, self.tp_ep_group) + + # Reshape the output tensor + output = output.view(self.hidden_shape) + return output + + output = alltoall_token_unpermutation2(permutated_local_input_tokens) + + self.input_splits = None + self.output_splits = None + self.num_global_tokens_per_local_expert = None + self.num_global_tokens_per_local_expert_cpu = None + + return output, None From 63cb0622ea929f85029b4df838372d479a0e15c6 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Tue, 29 Jul 2025 14:35:36 +0800 Subject: [PATCH 02/56] [v0.9.1][Feature] add Moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/ops/fused_moe.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 9b9e1a9fc7d..e09a7eec369 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -22,6 +22,8 @@ import torch.distributed as dist import torch_npu from torch import nn +from transformers import PretrainedConfig +from vllm.attention import AttentionMetadata from vllm.config import get_current_vllm_config from vllm.distributed import (GroupCoordinator, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -45,6 +47,7 @@ data_parallel_reduce_scatter from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer +from vllm_ascend.ops.moe_dispatcher.token_dispatcher import MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, get_all_reduce_merge_state, From 715e6f1de6c9d995004ed87a18ed3d4af23f27e7 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Tue, 29 Jul 2025 14:42:15 +0800 Subject: [PATCH 03/56] [v0.9.1][Feature] add Moe alltoallv. 
Signed-off-by: weijinqian_v1 --- vllm_ascend/ops/fused_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index e09a7eec369..19434390545 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -37,6 +37,7 @@ FusedMoEParallelConfig # isort: skip from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map) +from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization.base_config import \ QuantizationConfig From 7c7e4e98a32c900af9579d924c38e9319df4dcff Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Tue, 29 Jul 2025 17:30:19 +0800 Subject: [PATCH 04/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/models/__init__.py | 4 - vllm_ascend/models/qwen3_dbo.py | 552 -------------------------------- vllm_ascend/ops/fused_moe.py | 2 +- 3 files changed, 1 insertion(+), 557 deletions(-) delete mode 100644 vllm_ascend/models/qwen3_dbo.py diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index e3609f802d7..58c4a26c19c 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -41,10 +41,6 @@ def register_model(): "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM") - ModelRegistry.register_model( - "Qwen3MoeForCausalLM", - "vllm_ascend.models.qwen3_dbo:CustomQwen3MoeForCausalLMDBO") - else: ModelRegistry.register_model( "DeepseekV2ForCausalLM", diff --git a/vllm_ascend/models/qwen3_dbo.py b/vllm_ascend/models/qwen3_dbo.py deleted file mode 100644 index fa87fe81f22..00000000000 --- a/vllm_ascend/models/qwen3_dbo.py +++ /dev/null @@ -1,552 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
- -# """Inference-only Qwen3 model.""" -from types import SimpleNamespace -from typing import List, Optional, Union - -import torch -import torch_npu -import vllm.model_executor.models.qwen3_moe as qwen3 -from torch import nn -from transformers import PretrainedConfig -from vllm.attention import AttentionMetadata -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - get_tp_group) -from vllm.forward_context import get_forward_context, set_forward_context -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.models.qwen3_moe import (Qwen3MoeDecoderLayer, - Qwen3MoeForCausalLM, - Qwen3MoeModel) -from vllm.model_executor.models.utils import ( - make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.sequence import IntermediateTensors - -import vllm_ascend.envs as envs_ascend -from vllm_ascend.distributed.tensor_parallel import \ - gather_from_sequence_parallel_region -from vllm_ascend.multistream.base import MSEventKey -from vllm_ascend.multistream.context import ( - advance_step_multistream_layer_context, get_multistream_layer_context) -from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer, - MultiStreamPreTransformerLayer) -from vllm_ascend.multistream.metadata import (MultiStreamConfig, - MultiStreamStepMetadata, - make_multistream_metadata_ds) -from vllm_ascend.ops.fused_moe import (AscendSparseMoeBlock, apply_mlp, - select_experts) - -VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO - - -class Qwen3MoeDecoderLayerDBO(Qwen3MoeDecoderLayer): - - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super(Qwen3MoeDecoderLayerDBO, self).__init__(config, cache_config, - quant_config, prefix) - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tp_group().rank_in_group - self.tp_group = get_tp_group().device_group - self.dummy_vllm_config = SimpleNamespace( - parallel_config=SimpleNamespace(data_parallel_size=1, ), - compilation_config=SimpleNamespace(static_forward_context=None, ), - other_setting="value", - ) - self.config = config - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - - # should split ops in Decoder Layer - def _forward_ms_op_input_layernorm( - self, - hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - return hidden_states, residual - - def _forward_ms_op_attn( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor, - kv_cache: Optional[torch.Tensor] = None, - attn_metadata: Optional[AttentionMetadata] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - self.dummy_vllm_config.compilation_config.static_forward_context = ( - get_forward_context().no_compile_layers) - with set_forward_context(attn_metadata, self.dummy_vllm_config): - 
hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - if hidden_states.dtype == torch.float16: - # Fix FP16 overflow - # We scale both hidden_states and residual before - # rmsnorm, and rmsnorm result would not affect by scale. - hidden_states *= 1.0 / self.routed_scaling_factor - if self.layer_idx == 0: - # The residual is shared by all layers, we only scale it on - # first layer. - residual *= 1.0 / self.routed_scaling_factor - return hidden_states, residual - - def _forward_ms_op_post_attn_layernorm( - self, - hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], - ): - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - return hidden_states, residual - - def _forward_op_gating( - self, - hidden_states: torch.Tensor, - attn_metadata: Optional[AttentionMetadata] = None, - ) -> torch.Tensor: - if attn_metadata is None: - attn_metadata = get_forward_context().attn_metadata - # when profile runs, force experts to load balanced tokens - # to avoid high memory consumption on a single rank. - enable_force_load_balance = get_forward_context().in_profile_run - - num_tokens, hidden_dim = hidden_states.shape - - if self.tp_size > 1: - # pass - num_tokens, hidden_size = hidden_states.shape - if num_tokens < self.tp_size: - hidden_states = nn.functional.pad( - hidden_states, (0, 0, 0, self.tp_size - num_tokens)) - chunk_hidden_states = torch.tensor_split(hidden_states, - self.tp_size, - dim=0) - chunked_hidden_states_sizes = [ - x.shape[0] for x in chunk_hidden_states - ] - local_hidden_states = chunk_hidden_states[self.tp_rank] - else: - local_hidden_states = hidden_states - chunked_hidden_states_sizes = None - - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.mlp.gate(local_hidden_states) - - # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern - mlp_config = self.config - if mlp_config.num_experts == 256: - topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( - router_logits, - k=mlp_config.num_experts_per_tok, # topk当前写8 - bias=self.mlp.gate.e_score_correction_bias, - k_group=mlp_config.topk_group, # fix: 4 - group_count=mlp_config.n_group, # fix 8 - group_select_mode=1, # 0: max in group; 1: topk2.sum(fix) - renorm=0, # 0: softmax->topk(fix); 1: topk->softmax - norm_type=1, # 0: softmax; 1: sigmoid(fix) - routed_scaling_factor=1, - eps=float(1e-20), - ) - else: - topk_weights, topk_ids = select_experts( - hidden_states=local_hidden_states, - router_logits=router_logits, - top_k=mlp_config.num_experts_per_tok, - use_grouped_topk=False, - renormalize=mlp_config.norm_topk_prob, - topk_group=getattr(mlp_config, "topk_group", None), - num_expert_group=getattr(mlp_config, "n_group", None), - custom_routing_function=None, - scoring_func=getattr(mlp_config, "scoring_func", "softmax"), - e_score_correction_bias=getattr(self.mlp.gate, - "e_score_correction_bias", - None), - ) - - topk_weights = topk_weights.to(hidden_states.dtype) - # this is a naive implementation for experts load balance so as - # to avoid accumulating too much tokens on a single rank. - # currently it is only activated when doing profile runs. 
- if enable_force_load_balance: - topk_ids = torch.randint_like(topk_ids, 0, self.config.num_experts) - - return topk_weights, topk_ids, local_hidden_states, chunked_hidden_states_sizes - - def _forward_op_grouped_mlp(self, dispatched_input, tokens_per_expert): - return apply_mlp( - dispatched_input, - self.mlp.experts.w13_weight, - self.mlp.experts.w2_weight, - tokens_per_expert, - ) - - def _forward_combine_comm(self, hidden_states, microbatch_id, num_tokens, - chunked_hidden_states_sizes): - token_dispatcher = self.mlp.experts.token_dispatchers[microbatch_id] - final_hidden_states, _ = token_dispatcher.token_unpermutation( - hidden_states) - if hasattr(self.mlp, "routed_scaling_factor"): - final_hidden_states = final_hidden_states * self.mlp.routed_scaling_factor - - if self.tp_size > 1: - final_hidden_states = gather_from_sequence_parallel_region( - final_hidden_states, self.tp_group, - chunked_hidden_states_sizes) - if num_tokens < self.tp_size: - final_hidden_states = final_hidden_states[:num_tokens] - - if hasattr(self.mlp, "shared_experts"): - final_hidden_states = ( - final_hidden_states + - token_dispatcher.cached_shared_expert_output) - token_dispatcher.cached_shared_expert_output.untyped_storage( - ).resize_(0) - token_dispatcher.cached_shared_expert_output = None - - final_hidden_states = final_hidden_states.view(num_tokens, -1) - - return final_hidden_states - - def _forward_ms_layer_alltoallv_finegrained( - self, - positions: List[torch.Tensor], - hidden_states: List[torch.Tensor], - residual: List[torch.Tensor], - attn_metadata: List[AttentionMetadata], - kv_cache: Optional[torch.Tensor] = None, - ): - layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( - ) - assert layer_index >= 0 and ms_metadata is not None - num_micro_batchs = ms_metadata.ms_config.num_micro_batches - assert len(positions) == num_micro_batchs - assert len(hidden_states) == num_micro_batchs - assert residual is not None - assert attn_metadata is not None - num_tokens = [None] * num_micro_batchs - hidden_dims = [None] * num_micro_batchs - topk_weights, topk_ids = [None] * num_micro_batchs, [ - None - ] * num_micro_batchs - tokens_per_expert = [None] * num_micro_batchs - dispatched_input = [None] * num_micro_batchs - router_expert_output = [None] * num_micro_batchs - chunked_hidden_states_sizes = [None] * num_micro_batchs - token_dispatchers = self.mlp.experts.token_dispatchers - - def discard_tensor(tensor): - if isinstance(tensor, torch.Tensor): - tensor = [tensor] - for t in tensor: - t.untyped_storage().resize_(0) - - # block 1 : attention - # block 2 : Router Gating - # block 3 : Token DisPatch - # the attn computation of microbatch 1 can be overlapped with the moe - # communication in the previous layer, and the attn computation of microbatch 2 - # can be overlapped with the attn communication of microbatch 1 - for i in range(num_micro_batchs): - forward_context = get_forward_context() - layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( - ) - ms_metadata.try_wait_event(layer_index - 1, i, - MSEventKey.FFN_AR_FINISH) - forward_context.attn_metadata = attn_metadata[i] - - # input layernorm - hidden_states[i], residual[ - i] = self._forward_ms_op_input_layernorm( - hidden_states[i], residual[i]) - # attention and tp allreduce - hidden_states[i], residual[i] = self._forward_ms_op_attn( - positions[i], hidden_states[i], residual[i], kv_cache, - attn_metadata[i]) - # post attention layer norm - hidden_states[i], residual[ - i] = 
self._forward_ms_op_post_attn_layernorm( - hidden_states[i], residual[i]) - num_tokens[i], hidden_dims[i] = hidden_states[i].shape - # If TP is enabled, hidden_states will be chunked. - ( - topk_weights[i], - topk_ids[i], - dispatched_input[i], - chunked_hidden_states_sizes[i], - ) = self._forward_op_gating(hidden_states[i], attn_metadata[i]) - token_dispatchers[i].preprocess_and_permtute1( - dispatched_input[i], - topk_weights[i], - topk_ids[i], - shared_experts=None, - shared_experts_input=None, - ) - # Launch DisPatch Comm in a New Stream. - dispatch_context = MultiStreamStepMetadata( - comm_stream=ms_metadata.communicate_stream, - before_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.MOE_BEFORE_COMM], - after_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.MOE_AFTER_COMM], - ) - dispatch_context.before_comm_event.record() - # print_with_sync(f'begin token dispatch{i}...', torch.distributed.get_rank()) - with torch.npu.stream(dispatch_context.comm_stream): - dispatch_context.comm_stream.wait_event( - dispatch_context.before_comm_event) - token_dispatchers[i].dispatch_alltoall() - dispatched_input[i], tokens_per_expert[i] = token_dispatchers[ - i].permute2() - dispatch_context.after_comm_event.record() - - # print_with_sync('begin experts...', torch.distributed.get_rank()) - # block 4 : Router Experts Computation - # block 5 : Token Combine Communication - for i in range(num_micro_batchs): - ms_metadata.try_wait_event(layer_index, i, - MSEventKey.MOE_AFTER_COMM) - discard_tensor(hidden_states[i]) - router_expert_output[i] = self._forward_op_grouped_mlp( - dispatched_input[i], tokens_per_expert[i]) - discard_tensor(dispatched_input[i]) - - # Launch Combine Comm in a New Stream. - combine_context = MultiStreamStepMetadata( - comm_stream=ms_metadata.communicate_stream, - before_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.FFN_COM_FINISH], - after_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.FFN_AR_FINISH], - ) - combine_context.before_comm_event.record() - ms_metadata.try_wait_event(layer_index, i, - MSEventKey.MOE_SE_COMM_FINISH) - with torch.npu.stream(combine_context.comm_stream): - combine_context.comm_stream.wait_event( - combine_context.before_comm_event) - hidden_states[i] = self._forward_combine_comm( - router_expert_output[i], - i, - num_tokens[i], - chunked_hidden_states_sizes[i], - ) - ms_metadata.ms_events[layer_index][i][ - MSEventKey. 
- FFN_AR_FINISH] = combine_context.comm_stream.record_event( - ) - - return hidden_states, residual - - -@support_torch_compile -class CustomQwen3DBOMoEModel(Qwen3MoeModel): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - nn.Module.__init__(self) - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.config = config - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - prefix=f"{prefix}.embed_tokens") - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: Qwen3MoeDecoderLayerDBO( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix, - ), - prefix=f"{prefix}.layers", - ) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size) - - # dbo related members - if VLLM_ASCEND_ENABLE_DBO: - self.use_mla = False - self.multistream_config = MultiStreamConfig() - multistream_metadata = make_multistream_metadata_ds( - start_layer=self.start_layer, - end_layer=self.end_layer, - causal_lm=getattr(config, "causal_lm", True), - multistream_config=self.multistream_config, - ) - self.ms_pre_layer = MultiStreamPreTransformerLayer( - multistream_metadata) - self.ms_post_layer = MultiStreamPostTransformerLayer( - multistream_metadata) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - num_normal_layers = (0 if VLLM_ASCEND_ENABLE_DBO and self.can_run_ms() - else self.end_layer - self.start_layer) - - moe_start_layer = self.start_layer + num_normal_layers - for i in range(self.start_layer, min(moe_start_layer, self.end_layer)): - layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, residual) - - if moe_start_layer < self.end_layer: - # if we enable multistream/dbo, process sparse layers here - hidden_states, residual = self._forward_ms_layers( - positions=positions, - hidden_states=hidden_states, - residual=residual, - moe_start_layer=moe_start_layer, - ) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def can_run_ms(self): - attn_metadata = get_forward_context().attn_metadata - # enable prefill overlap - with_prefill = get_forward_context().with_prefill - if (attn_metadata is None or not with_prefill - or not attn_metadata.enable_dbo_across_dp): - return False - - return True - - def _forward_ms_layers( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor, - moe_start_layer: int, - kv_caches: Optional[List[torch.Tensor]] = None, - ): - - if moe_start_layer == self.end_layer: - return hidden_states, residual - - 
attn_metadata, [positions, hidden_states, - residual] = self.ms_pre_layer( - [positions, hidden_states, residual], ) - num_micro_batch = len(attn_metadata) - # the rest layers - for i in range(moe_start_layer, self.end_layer): - layer = self.layers[i] - ms_layer_forward_func = layer._forward_ms_layer_alltoallv_finegrained - # print("get_called......") - hidden_states, residual = ms_layer_forward_func( - positions=positions, - hidden_states=hidden_states, - residual=residual, - attn_metadata=attn_metadata, - ) - advance_step_multistream_layer_context() - - layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( - ) - for i in range(num_micro_batch): - ms_metadata.try_wait_event(layer_index - 1, i, - MSEventKey.FFN_AR_FINISH) - - [hidden_states, - residual] = self.ms_post_layer([hidden_states, residual], ) - return hidden_states, residual - - -class CustomQwen3MoeForCausalLMDBO(Qwen3MoeForCausalLM): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - "experts": - ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], - } - qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - nn.Module.__init__(self) - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = CustomQwen3DBOMoEModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def forward(self, *args, **kwargs): - if "graph_enable" in kwargs: - kwargs.pop("graph_enable") - return super().forward(*args, **kwargs) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 19434390545..1ed1c298f49 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1483,11 +1483,11 @@ def forward(self, dist.all_gather(list(chunk_hidden_states), e_hidden_states, self.tp_group) final_hidden_states = torch.cat(chunk_hidden_states, dim=0) + dispose_tensor(e_hidden_states) else: final_hidden_states = e_hidden_states if num_tokens < padding_size: final_hidden_states = final_hidden_states[:num_tokens] - dispose_tensor(e_hidden_states) elif self.dp_size > 1: if fused_moe_state == FusedMoEState.NaiveMulticast: start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[ From 7106d77e1ca605462fb427ac229813a56fcd4eb1 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Tue, 29 Jul 2025 18:46:53 +0800 Subject: [PATCH 05/56] [v0.9.1][Feature] add moe alltoallv. 
Signed-off-by: weijinqian_v1 --- vllm_ascend/models/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index 58c4a26c19c..0b1b67a4f19 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -50,9 +50,9 @@ def register_model(): "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM") - ModelRegistry.register_model( - "Qwen3MoeForCausalLM", - "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") + ModelRegistry.register_model( + "Qwen3MoeForCausalLM", + "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") ModelRegistry.register_model( "Qwen3ForCausalLM", "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM") From 1b53047491d34c37f5ecb633e48918ab60c7e59c Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Tue, 29 Jul 2025 18:48:47 +0800 Subject: [PATCH 06/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/attention/attention_v1.py | 13 --- vllm_ascend/multistream/ms_split.py | 115 +------------------------- 2 files changed, 1 insertion(+), 127 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 01b51e15607..668c802c400 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -28,7 +28,6 @@ from vllm.utils import direct_register_custom_op from vllm.v1.core.sched.output import SchedulerOutput -from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, nd_to_nz_2d, nd_to_nz_spec) @@ -151,18 +150,6 @@ class AscendMetadata: # (num_tokens,) slot_mapping: torch.Tensor = None - def split_metadata_for_multistream( - self, - ms_split_config: MSAttentionMetadataSplitConfig, - ) -> list["AscendMetadata"]: - """Split metadata for multi-stream with AscendMetadata""" - from vllm_ascend.multistream.ms_split import model_input_split_v1_attn - return model_input_split_v1_attn( - ms_split_config=ms_split_config, - attn_metadata=self, - _metadata_cls=AscendMetadata, - ) - class AscendAttentionMetadataBuilder: diff --git a/vllm_ascend/multistream/ms_split.py b/vllm_ascend/multistream/ms_split.py index 684f6aea136..3af6337e473 100644 --- a/vllm_ascend/multistream/ms_split.py +++ b/vllm_ascend/multistream/ms_split.py @@ -4,8 +4,7 @@ import numpy as np import torch -from vllm_ascend.attention.attention_v1 import (AscendAttentionState, - AscendMetadata) +from vllm_ascend.attention.attention_v1 import AscendAttentionState from .base import MSAttentionMetadataSplitConfig @@ -242,115 +241,3 @@ def model_input_split_v1_mla_attn( decode=decode_post, ) return [attention_metadata_pre, attention_metadata_post] - - -def model_input_split_v1_attn( - attn_metadata: AscendMetadata, - _metadata_cls, - ms_split_config: MSAttentionMetadataSplitConfig, -) -> List[Any]: - assert 0 < ms_split_config.num_micro_batches < 3 - if attn_metadata is None: - return [attn_metadata] - [token_index, - seq_index] = compute_split_seq_index(attn_metadata.query_lens, - attn_metadata.attn_state, - attn_metadata.num_actual_tokens) - if token_index == 0 or seq_index == 0 or seq_index == len( - attn_metadata.query_lens): - return [attn_metadata] - - # split attn metadata - - [block_table_pre, - block_table_post] = split_attn_tensor_type(attn_metadata.block_tables, - seq_index) - - query_start_loc_pre = 
query_start_loc_post = None - if attn_metadata.query_start_loc is not None: - query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] - query_start_loc_post = deepcopy( - attn_metadata.query_start_loc[seq_index:] - ) - attn_metadata.query_start_loc[seq_index] - - [query_lens_pre, - query_lens_post] = split_attn_tensor_type(attn_metadata.query_lens, - seq_index) - [seq_lens_pre, - seq_lens_post] = split_attn_tensor_type(attn_metadata.seq_lens, seq_index) - - max_query_len_pre = max_query_len_post = None - if attn_metadata.max_query_len is not None: - max_query_len_pre, max_query_len_post = max(query_lens_pre), max( - query_lens_post) - - [slot_mapping_pre, - slot_mapping_post] = split_attn_tensor_type(attn_metadata.slot_mapping, - token_index) - - is_only_prefill_pre = is_only_prefill_post = attn_metadata.is_only_prefill - has_prefill_pre, _ = torch.any(query_lens_pre > 1).item(), torch.any( - query_lens_post > 1).item() - - if not attn_metadata.is_only_prefill: - is_only_prefill_post = torch.all(query_lens_post > 1).item() - - if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: - # the attn_mla kernel in torch npu only accept 128*128 attn mask - attn_mask_pre = attn_mask_post = attn_metadata.attn_mask - attn_state_pre = attn_state_post = attn_metadata.attn_state - elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: - # should be none in decode only state - attn_mask_pre = attn_mask_post = attn_metadata.attn_mask - attn_state_pre = attn_state_post = AscendAttentionState.DecodeOnly # type: ignore - else: - # chunked prefill - assert attn_metadata.attn_mask is not None - if has_prefill_pre: - attn_state_pre = attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore - attn_mask_pre = attn_metadata.attn_mask[:token_index, :max( - seq_lens_pre)].contiguous() - attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore - attn_mask_post = attn_metadata.attn_mask[ - token_index:, :max(seq_lens_post)].contiguous() - else: - attn_state_pre = AscendAttentionState.DecodeOnly # type: ignore - attn_mask_pre = None - attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore - attn_mask_post = attn_metadata.attn_mask[ - token_index:, :max(seq_lens_post)].contiguous() - - # construct metadata - attention_metadata_pre = _metadata_cls( - num_actual_tokens=token_index, - block_tables=block_table_pre, - query_start_loc=query_start_loc_pre, - query_lens=query_lens_pre, - seq_lens=seq_lens_pre, - seq_lens_list=seq_lens_pre.tolist(), - max_query_len=max_query_len_pre, - slot_mapping=slot_mapping_pre, - is_only_prefill=is_only_prefill_pre, - attn_state=attn_state_pre, - attn_mask=attn_mask_pre, - num_input_tokens=token_index, - enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, - ) - - attention_metadata_post = _metadata_cls( - num_actual_tokens=attn_metadata.num_actual_tokens - token_index, - block_tables=block_table_post, - query_start_loc=query_start_loc_post, - query_lens=query_lens_post, - seq_lens=seq_lens_post, - seq_lens_list=seq_lens_post.tolist(), - max_query_len=max_query_len_post, - slot_mapping=slot_mapping_post, - is_only_prefill=is_only_prefill_post, - attn_state=attn_state_post, - attn_mask=attn_mask_post, - num_input_tokens=attn_metadata.num_input_tokens - token_index, - enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, - ) - - return [attention_metadata_pre, attention_metadata_post] From a8635071a183e4dfd96fb6b667b2c9e6eb8effb5 Mon Sep 17 
00:00:00 2001 From: weijinqian_v1 Date: Tue, 29 Jul 2025 19:19:29 +0800 Subject: [PATCH 07/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- .../test_offline_inference_distributed.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index fa19ec3eab7..5330c59b10c 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -175,6 +175,28 @@ def test_models_distributed_topk() -> None: vllm_model.generate(example_prompts, sampling_params) +@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"}) +def test_models_distributed_topk() -> None: + example_prompts = [ + "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", + "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", + "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", + ] + dtype = "half" + sampling_params = SamplingParams(max_tokens=5, + temperature=0.0, + top_k=50, + top_p=0.9) + + with VllmRunner( + "deepseek-ai/DeepSeek-V2-Lite", + dtype=dtype, + tensor_parallel_size=4, + distributed_executor_backend="mp", + ) as vllm_model: + vllm_model.generate(example_prompts, sampling_params) + + def test_models_distributed_Qwen3_W8A8(): example_prompts = [ "Hello, my name is", From a702414fdc62719baaddc08c2a188b5eee00aee8 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Tue, 29 Jul 2025 19:46:02 +0800 Subject: [PATCH 08/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/e2e/multicard/test_offline_inference_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index 5330c59b10c..9ea67a1dd3e 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -176,7 +176,7 @@ def test_models_distributed_topk() -> None: @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"}) -def test_models_distributed_topk() -> None: +def test_models_distributed_alltoallv() -> None: example_prompts = [ "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", From 5841dc837ac3a7856107b13f2e94215d6538b450 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 09:28:47 +0800 Subject: [PATCH 09/56] [v0.9.1][Feature] add moe alltoallv. 
Signed-off-by: weijinqian_v1 --- vllm_ascend/ops/fused_moe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 1ed1c298f49..fbe187235f3 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -16,7 +16,7 @@ # Adapted from vllm/tests/kernels/test_moe.py import os -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Tuple, Union import torch import torch.distributed as dist @@ -48,7 +48,8 @@ data_parallel_reduce_scatter from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer -from vllm_ascend.ops.moe_dispatcher.token_dispatcher import MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig +from vllm_ascend.ops.moe_dispatcher.token_dispatcher import ( + MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig) from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, get_all_reduce_merge_state, From 3542670701d82d2f30399678078fe2e47c784f64 Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Tue, 29 Jul 2025 18:06:45 +0800 Subject: [PATCH 10/56] [Perf] Avoid performing index selection of sin/cos cache every layer (#1890) Optimize number of index selections of sin/cos cache. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/656c24f1b5d8a662e9ec391503d146341def5f18 Signed-off-by: whx-sjtu <2952154980@qq.com> Signed-off-by: weijinqian_v1 --- tests/ut/attention/test_mla_v1.py | 17 ++++++ vllm_ascend/attention/mla_v1.py | 75 +++++++++++++++++++-------- vllm_ascend/worker/model_runner_v1.py | 3 ++ 3 files changed, 73 insertions(+), 22 deletions(-) diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py index 303571840ff..7f8d5f75c6e 100644 --- a/tests/ut/attention/test_mla_v1.py +++ b/tests/ut/attention/test_mla_v1.py @@ -331,15 +331,30 @@ def test_build_dummy(self, mock_ascend_config): runner.chunked_prefill_enabled = False runner.attn_mask = torch.zeros((1, 1), dtype=torch.bool) runner.spec_attn_mask = torch.zeros((1, 1), dtype=torch.bool) + runner.dtype = torch.float16 builder = AscendMLAMetadataBuilder(runner=runner, metadata_cls=AscendMLAMetadata) + builder.rope_dim = 64 with patch.object(builder, "_get_graph_runner_block_tables", side_effect=lambda x, y: y): metadata = builder.build_torchair_graph_dummy(3, 3) + sin_golden = torch.ones(3, + 1, + 1, + 64, + dtype=runner.dtype, + device=runner.device) + cos_golden = torch.ones(3, + 1, + 1, + 64, + dtype=runner.dtype, + device=runner.device) + self.assertIsInstance(metadata, AscendMLAMetadata) self.assertEqual(metadata.num_input_tokens, 3) self.assertEqual(metadata.num_actual_tokens, 3) @@ -354,6 +369,8 @@ def test_build_dummy(self, mock_ascend_config): self.assertEqual(metadata.seq_lens.shape[0], 3) self.assertEqual(metadata.slot_mapping.shape[0], 3) self.assertEqual(metadata.query_start_loc.shape[0], 3) + assert torch.equal(sin_golden, metadata.decode.sin) + assert torch.equal(cos_golden, metadata.decode.cos) class TestAscendMLAImpl(TestBase): diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 5d993e00e14..4e247562cf7 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -80,6 +80,8 @@ class ChunkedContextMetadata: max_query_len: int max_seq_lens: int chunked_context: 
Optional[ChunkedContextMetadata] = None + sin: torch.Tensor = None + cos: torch.Tensor = None @dataclass @@ -92,6 +94,8 @@ class AscendMLADecodeMetadata: max_seq_lens: int seq_lens_list: list[int] attn_mask: Optional[torch.Tensor] = None + sin: torch.Tensor = None + cos: torch.Tensor = None @dataclass @@ -200,6 +204,9 @@ def __init__(self, ) ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.rope_dim = self.runner.model_config.hf_text_config.qk_rope_head_dim + self.cos_cache = None + self.sin_cache = None def reorder_batch(self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput") -> bool: @@ -318,13 +325,27 @@ def build_torchair_graph_dummy( -1, dtype=torch.int32, device=device) + sin = torch.ones(num_reqs, + 1, + 1, + self.rope_dim, + dtype=self.runner.dtype, + device=device) + cos = torch.ones(num_reqs, + 1, + 1, + self.rope_dim, + dtype=self.runner.dtype, + device=device) decode_metadata = AscendMLADecodeMetadata( input_positions=input_positions, block_table=block_table, seq_lens=seq_lens, seq_lens_list=seq_lens.tolist(), max_seq_lens=1, - attn_mask=self.runner.spec_attn_mask) + attn_mask=self.runner.spec_attn_mask, + sin=sin, + cos=cos) return self.metadata_cls( # type: ignore num_input_tokens=num_actual_tokens, num_actual_tokens=num_actual_tokens, @@ -370,6 +391,16 @@ def build( seq_lens = seq_lens_cpu max_query_len = query_lens.max().item() max_seq_lens = seq_lens.max().item() + if self.cos_cache is None: + self.cos_cache = self.runner.get_model( + ).model.layers[0].self_attn.rotary_emb.cos_cached + self.sin_cache = self.runner.get_model( + ).model.layers[0].self_attn.rotary_emb.sin_cached + if self.cos_cache.dtype != self.runner.dtype: # type: ignore + self.cos_cache = self.cos_cache.to( # type: ignore + self.runner.dtype) # type: ignore + self.sin_cache = self.sin_cache.to( # type: ignore + self.runner.dtype) # type: ignore prefill_metadata = None chunked_context_metadata = None @@ -415,18 +446,26 @@ def build( chunk_seq_lens=chunk_seq_lens, workspace=self.chunked_prefill_workspace, ) - + prefill_input_positions = input_positions[tokens_start:] + cos = self.cos_cache[ + prefill_input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + sin = self.sin_cache[ + prefill_input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) prefill_metadata = AscendMLAPrefillMetadata( attn_mask=self.runner.attn_mask, query_lens=query_lens[tokens_start:], seq_lens=seq_lens, context_lens=seq_lens[tokens_start:], - input_positions=input_positions[tokens_start:], + input_positions=prefill_input_positions, block_table=block_table[reqs_start:, ...], max_query_len=max_query_len, max_seq_lens=max_seq_lens, query_start_loc=prefill_query_start_loc, chunked_context=chunked_context_metadata, + sin=sin, + cos=cos, ) decode_metadata = None @@ -467,6 +506,10 @@ def build( dtype=input_positions.dtype, device=input_positions.device) input_positions = torch.cat([input_positions, padding_0]) + cos = self.cos_cache[input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + sin = self.sin_cache[input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) decode_metadata = AscendMLADecodeMetadata( input_positions=input_positions, @@ -474,7 +517,9 @@ def build( seq_lens=seq_lens, seq_lens_list=seq_lens.tolist(), max_seq_lens=max_seq_lens, - attn_mask=self.runner.spec_attn_mask) + attn_mask=self.runner.spec_attn_mask, + sin=sin, + cos=cos) return self.metadata_cls( # type: ignore num_actual_tokens=num_actual_tokens, @@ -1069,15 
+1114,8 @@ def forward( decode_k_nope = None assert attn_metadata.decode is not None if self.running_in_graph: - seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor - cos = self.rotary_emb.cos_cached[:seq_len].to( - dtype=decode_hs_or_q_c.dtype) - sin = self.rotary_emb.sin_cached[:seq_len].to( - dtype=decode_hs_or_q_c.dtype) - cos = cos[attn_metadata.decode.input_positions] - sin = sin[attn_metadata.decode.input_positions] - cos = cos[:, None, None, :] - sin = sin[:, None, None, :] + cos = attn_metadata.decode.cos + sin = attn_metadata.decode.sin with npu_stream_switch("mla_secondary", 0, enabled=enable_multistream_mla): @@ -1124,15 +1162,8 @@ def forward( prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] if self.torchair_graph_enabled: num_tokens = prefill_hs_or_q_c.shape[0] - seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor - cos = self.rotary_emb.cos_cached[:seq_len].to( - dtype=prefill_q_pe.dtype) - sin = self.rotary_emb.sin_cached[:seq_len].to( - dtype=prefill_q_pe.dtype) - cos = cos[attn_metadata.prefill.input_positions] - sin = sin[attn_metadata.prefill.input_positions] - cos = cos[:, None, None, :] - sin = sin[:, None, None, :] + cos = attn_metadata.prefill.cos + sin = attn_metadata.prefill.sin prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 4c008b46caf..2bee8dd442d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1799,6 +1799,9 @@ def _dummy_run( attn_metadata.decode.input_positions) torch._dynamo.mark_static( get_forward_context().mc2_mask) + if hasattr(attn_metadata.decode, "sin"): + torch._dynamo.mark_static(attn_metadata.decode.sin) + torch._dynamo.mark_static(attn_metadata.decode.cos) torch._dynamo.mark_static(attn_metadata.slot_mapping) for kv in self.kv_caches: assert isinstance( From 05f2ff26b11b421fafb5c35dc76909fdff82542c Mon Sep 17 00:00:00 2001 From: curryliu <99582471+Irving11-BKN@users.noreply.github.com> Date: Tue, 29 Jul 2025 18:51:57 +0800 Subject: [PATCH 11/56] [Feature] Enable inference support for Deepseekr1-w8a8-MTP (#1994) Support the inference of the Deepseekr1-w8a8-mtp model with statically-quantized shared_head in MTP layers. 
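For readers who want to exercise this path end to end, a minimal check could follow the VllmRunner conventions used by the e2e tests later in this series. This is only a sketch: the model id, the speculative-decoding settings, and the assumption that VllmRunner forwards extra kwargs to vllm.LLM are illustrative and not part of this patch; only quantization="ascend" and the runner interface are taken from the tests in this series.

    from tests.e2e.conftest import VllmRunner

    def check_deepseek_r1_w8a8_mtp():
        prompts = ["Hello, my name is"]
        with VllmRunner(
                "vllm-ascend/DeepSeek-R1-W8A8",  # placeholder repo id, not a real checkpoint reference
                max_model_len=4096,
                enforce_eager=True,
                dtype="auto",
                tensor_parallel_size=2,
                quantization="ascend",  # routes embedding-type layers through the new quant path
                speculative_config={  # assumed vLLM MTP knobs; verify against your vLLM version
                    "method": "deepseek_mtp",
                    "num_speculative_tokens": 1,
                },
        ) as vllm_model:
            vllm_model.generate_greedy(prompts, 5)

With quantization="ascend", get_quant_method() now returns the new AscendEmbeddingMethod for VocabParallelEmbedding layers (see the quant_config.py hunk below), which is what lets the MTP layer's statically-quantized shared_head.head load its w8a8 weights.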
- vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/6eca337ce09e7dfa05ce57c4183ddb5d4488c85e Signed-off-by: curryliu <120010041@link.cuhk.edu.cn> Signed-off-by: weijinqian_v1 --- vllm_ascend/models/deepseek_mtp.py | 23 ++++++++++++++++++++--- vllm_ascend/models/deepseek_v2.py | 4 +++- vllm_ascend/quantization/quant_config.py | 23 +++++++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py index 3cbc62e80d9..bd78115ad55 100644 --- a/vllm_ascend/models/deepseek_mtp.py +++ b/vllm_ascend/models/deepseek_mtp.py @@ -28,8 +28,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import \ - VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.models.deepseek_mtp import ( DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer, SharedHead) @@ -40,6 +40,20 @@ from .deepseek_v2 import CustomDeepseekV2DecoderLayer +class CustomDeepSeekShareHead(SharedHead): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + nn.Module.__init__(self) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "head")) + + class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer): def __init__( @@ -61,7 +75,10 @@ def __init__( self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False) - self.shared_head = SharedHead(config=config, quant_config=quant_config) + self.shared_head = CustomDeepSeekShareHead(config=config, + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "shared_head")) self.mtp_block = CustomDeepseekV2DecoderLayer(config, prefix, model_config, cache_config, diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index cb0649a7ffa..129e5eb3a85 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -868,7 +868,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index 7c7ee580334..5984dc74dc5 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -34,6 +34,8 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.vocab_parallel_embedding import ( + UnquantizedEmbeddingMethod, VocabParallelEmbedding) from vllm.model_executor.parameter import PerTensorScaleParameter from vllm.model_executor.utils import set_weight_attrs @@ -107,6 +109,12 @@ def get_quant_method(self, layer: torch.nn.Module, return 
AscendUnquantizedFusedMoEMethod() return AscendFusedMoEMethod(self, prefix, self.packed_modules_mapping) + elif isinstance(layer, VocabParallelEmbedding): + if self.is_layer_skipped_ascend(prefix, + self.packed_modules_mapping): + return UnquantizedEmbeddingMethod() + return AscendEmbeddingMethod(self, prefix, + self.packed_modules_mapping) return None def is_layer_skipped_ascend( @@ -319,3 +327,18 @@ def apply( def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if hasattr(self.quant_method, "process_weights_after_loading"): self.quant_method.process_weights_after_loading(layer) + + +class AscendEmbeddingMethod(AscendLinearMethod): + """Embedding method for Ascend quantization. + This class calls AscendQuantizer to search a specific quantization + implementations supported on ascend hardware for Embedding methods. + Args: + quant_config: The Ascend quantization config. + """ + + def __init__(self, quant_config: AscendQuantConfig, prefix: str, + packed_modules_mapping: Dict[str, Any]) -> None: + self.quantizer = AscendQuantizer.get_quantizer( + quant_config.quant_description, prefix, packed_modules_mapping) + self.quant_method = self.quantizer.build_linear_method() \ No newline at end of file From 5255de25d566ca3cc8f8129e184b5874c2ab0024 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 29 Jul 2025 18:59:05 +0800 Subject: [PATCH 12/56] [CI] Enable linux-aarch64-a2 (64GB) and tp2 * 2 max-parallel to speed up CI (#2065) ### What this PR does / why we need it? Currently our workflow run time takes about 3 hours in total, which seriously affects the developer experience, so it is urgent to have a optimization, after this pr, It is expected that the running time of the full CI can be shortened to 1h40min. - Enable linux-aarch64-a2 (64GB) to replace linux-arm64-npu (32GB) - Change TP4 ---> TP2 * 2 max-parallel - Move DeepSeek-V2-Lite-W8A8 to single card test ### Does this PR introduce _any_ user-facing change? No - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/a2480251ec92ba2a849464dde48db8a2b7f6ef81 --------- Signed-off-by: wangli Signed-off-by: weijinqian_v1 --- .github/actionlint.yaml | 8 ++-- .github/workflows/accuracy_test.yaml | 4 +- .github/workflows/vllm_ascend_doctest.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 7 ++-- .../workflows/vllm_ascend_test_long_term.yaml | 2 +- benchmarks/scripts/run_accuracy.py | 12 +++--- .../disaggregated_prefill_v1/gen_ranktable.sh | 2 +- .../long_term/accuracy/accuracy_multicard.py | 4 +- .../multicard/test_fused_moe_allgather_ep.py | 4 +- .../test_offline_inference_distributed.py | 32 ++++---------- tests/e2e/multicard/test_pipeline_parallel.py | 2 +- .../e2e/multicard/test_torchair_graph_mode.py | 4 +- tests/e2e/singlecard/quant/test_w8a8.py | 42 +++++++++++++++++++ .../e2e/singlecard/test_offline_inference.py | 25 ----------- 14 files changed, 75 insertions(+), 75 deletions(-) create mode 100644 tests/e2e/singlecard/quant/test_w8a8.py diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 78ea6f3bdfb..3b4d23f078f 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -1,8 +1,10 @@ self-hosted-runner: # Labels of self-hosted runner in array of strings. 
labels: - - linux-arm64-npu-1 - - linux-arm64-npu-2 - - linux-arm64-npu-4 + - linux-aarch64-a2-0 + - linux-aarch64-a2-1 + - linux-aarch64-a2-2 + - linux-aarch64-a2-4 + - linux-aarch64-a2-8 - linux-arm64-npu-static-8 - ubuntu-24.04-arm diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 949e76b5605..0a98feb186f 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -85,8 +85,8 @@ jobs: }} runs-on: >- ${{ - (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-arm64-npu-4') || - 'linux-arm64-npu-2' + (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') || + 'linux-aarch64-a2-1' }} strategy: matrix: diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml index 25746dbac3e..1b4faeacba8 100644 --- a/.github/workflows/vllm_ascend_doctest.yaml +++ b/.github/workflows/vllm_ascend_doctest.yaml @@ -48,7 +48,7 @@ jobs: matrix: vllm_verison: [v0.9.1-dev, v0.9.1-dev-openeuler, main, main-openeuler] name: vLLM Ascend test - runs-on: linux-arm64-npu-1 + runs-on: linux-aarch64-a2-1 container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_verison }} steps: diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 14d56ab47dd..580559c9483 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -136,7 +136,7 @@ jobs: strategy: max-parallel: 2 matrix: - os: [linux-arm64-npu-1] + os: [linux-aarch64-a2-1] vllm_version: [main, v0.10.0] name: singlecard e2e test runs-on: ${{ matrix.os }} @@ -213,9 +213,9 @@ jobs: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} strategy: - max-parallel: 1 + max-parallel: 2 matrix: - os: [linux-arm64-npu-4] + os: [linux-aarch64-a2-2] vllm_version: [main, v0.10.0] name: multicard e2e test runs-on: ${{ matrix.os }} @@ -275,7 +275,6 @@ jobs: # To avoid oom, we need to run the test in a single process. 
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo pytest -sv tests/e2e/multicard/test_data_parallel.py diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index d8af7890673..0dfa7e30944 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -42,7 +42,7 @@ jobs: strategy: max-parallel: 2 matrix: - os: [linux-arm64-npu-1, linux-arm64-npu-4] + os: [linux-aarch64-a2-1, linux-aarch64-a2-2] vllm_version: [main, v0.10.0] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py index 35b59bf992d..cc2f4e22da6 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -50,17 +50,17 @@ # Command templates for running evaluations MODEL_RUN_INFO = { "Qwen/Qwen3-30B-A3B": ( - "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n" + "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": ( - "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" + "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": ( - "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n" + "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n" "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1" ), @@ -94,9 +94,9 @@ # Model arguments for evaluation MODEL_ARGS = { - "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6", - "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2", - "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True", + "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6", + "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2", + "Qwen/Qwen3-30B-A3B": 
"pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True", } # Whether to apply chat template formatting diff --git a/examples/disaggregated_prefill_v1/gen_ranktable.sh b/examples/disaggregated_prefill_v1/gen_ranktable.sh index 33d4a32e8db..e8a923a909e 100644 --- a/examples/disaggregated_prefill_v1/gen_ranktable.sh +++ b/examples/disaggregated_prefill_v1/gen_ranktable.sh @@ -76,4 +76,4 @@ if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then --master_addr ${MASTER_ADDR} \ --master_port ${MASTER_PORT} \ gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT -fi \ No newline at end of file +fi diff --git a/tests/e2e/long_term/accuracy/accuracy_multicard.py b/tests/e2e/long_term/accuracy/accuracy_multicard.py index 9dd77a9bbc7..2bfb389e4b0 100644 --- a/tests/e2e/long_term/accuracy/accuracy_multicard.py +++ b/tests/e2e/long_term/accuracy/accuracy_multicard.py @@ -91,9 +91,9 @@ "Qwen/Qwen2.5-0.5B-Instruct": None, "Qwen/Qwen3-30B-A3B": - "tensor_parallel_size=4,enable_expert_parallel=True,enforce_eager=True", + "tensor_parallel_size=2,enable_expert_parallel=True,enforce_eager=True", "deepseek-ai/DeepSeek-V2-Lite": - "tensor_parallel_size=4,trust_remote_code=True,enforce_eager=True" + "tensor_parallel_size=2,trust_remote_code=True,enforce_eager=True" } multiprocessing.set_start_method("spawn", force=True) diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py index e804d74d908..916ce05dad5 100644 --- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py +++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py @@ -46,7 +46,7 @@ def test_generate_with_allgather(): sampling_params = SamplingParams(max_tokens=100, temperature=0.0) with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"), - tensor_parallel_size=4, + tensor_parallel_size=2, enforce_eager=True, max_model_len=1024, dtype="auto", @@ -74,7 +74,7 @@ def test_generate_with_alltoall(): sampling_params = SamplingParams(max_tokens=100, temperature=0.0) with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"), - tensor_parallel_size=4, + tensor_parallel_size=2, enforce_eager=True, max_model_len=1024, dtype="auto", diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index 9ea67a1dd3e..bf1269a8f62 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -42,7 +42,7 @@ def test_models_distributed_QwQ(): with VllmRunner( "Qwen/QwQ-32B", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) @@ -57,7 +57,7 @@ def test_models_distributed_DeepSeek_multistream_moe(): with VllmRunner( "vllm-ascend/DeepSeek-V3-Pruning", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", additional_config={ "torchair_graph_config": { @@ -82,7 +82,7 @@ def test_models_distributed_DeepSeek_dbo(): with VllmRunner( "deepseek-ai/DeepSeek-V2-Lite", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: model_arch = 'DeepseekV2ForCausalLM' @@ -106,7 +106,7 @@ def test_models_distributed_DeepSeekV3_dbo(): with VllmRunner( "vllm-ascend/DeepSeek-V3-Pruning", 
dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: model_arch = 'DeepseekV3ForCausalLM' @@ -118,24 +118,6 @@ def test_models_distributed_DeepSeekV3_dbo(): vllm_model.generate(example_prompts, sampling_params) -@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in") -def test_models_distributed_DeepSeek_W8A8(): - example_prompts = [ - "Hello, my name is", - ] - max_tokens = 5 - - with VllmRunner( - snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"), - max_model_len=8192, - enforce_eager=True, - dtype="auto", - tensor_parallel_size=4, - quantization="ascend", - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - def test_models_distributed_pangu(): example_prompts = [ "Hello, my name is", @@ -147,7 +129,7 @@ def test_models_distributed_pangu(): max_model_len=8192, enforce_eager=True, dtype="auto", - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) @@ -169,7 +151,7 @@ def test_models_distributed_topk() -> None: with VllmRunner( "deepseek-ai/DeepSeek-V2-Lite", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) @@ -208,7 +190,7 @@ def test_models_distributed_Qwen3_W8A8(): max_model_len=8192, enforce_eager=True, dtype="auto", - tensor_parallel_size=4, + tensor_parallel_size=2, quantization="ascend", ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py index 612744ede15..8dd3a9015ba 100644 --- a/tests/e2e/multicard/test_pipeline_parallel.py +++ b/tests/e2e/multicard/test_pipeline_parallel.py @@ -22,7 +22,7 @@ "Qwen/Qwen3-0.6B", ] -TENSOR_PARALLELS = [2] +TENSOR_PARALLELS = [1] PIPELINE_PARALLELS = [2] DIST_EXECUTOR_BACKEND = ["mp", "ray"] diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py index 9d83d98f324..9ad336c19c7 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -30,7 +30,7 @@ def _deepseek_torchair_test_fixture( additional_config: Dict, *, - tensor_parallel_size=4, + tensor_parallel_size=2, ): example_prompts = [ "Hello, my name is", @@ -98,7 +98,7 @@ def test_e2e_deepseekv3_with_torchair_ms_mla(): def _pangu_torchair_test_fixture( additional_config: Dict, *, - tensor_parallel_size=4, + tensor_parallel_size=2, ): example_prompts = [ "Hello, my name is", diff --git a/tests/e2e/singlecard/quant/test_w8a8.py b/tests/e2e/singlecard/quant/test_w8a8.py new file mode 100644 index 00000000000..6123d9b7378 --- /dev/null +++ b/tests/e2e/singlecard/quant/test_w8a8.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import pytest +from modelscope import snapshot_download # type: ignore[import-untyped] + +from tests.e2e.conftest import VllmRunner + +MODELS = [ + "vllm-ascend/DeepSeek-V2-Lite-W8A8", + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8" +] + + +@pytest.mark.parametrize("model", MODELS) +def test_quant_W8A8(example_prompts, model): + max_tokens = 5 + model_path = snapshot_download(model) + with VllmRunner( + model_path, + max_model_len=8192, + enforce_eager=True, + dtype="auto", + gpu_memory_utilization=0.7, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py index c6c68e55e8e..687bb2d3532 100644 --- a/tests/e2e/singlecard/test_offline_inference.py +++ b/tests/e2e/singlecard/test_offline_inference.py @@ -25,7 +25,6 @@ import pytest import vllm # noqa: F401 -from modelscope import snapshot_download # type: ignore[import-untyped] from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset @@ -40,9 +39,6 @@ MULTIMODALITY_VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] MULTIMODALITY_AUDIO_MODELS = ["Qwen/Qwen2-Audio-7B-Instruct"] -QUANTIZATION_MODELS = [ - "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8", -] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" AUDIO_ASSETS = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] AUDIO_PROMPT_TEMPLATES = { @@ -70,27 +66,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None: vllm_model.generate_greedy(example_prompts, max_tokens) -@pytest.mark.parametrize("model", QUANTIZATION_MODELS) -@pytest.mark.parametrize("max_tokens", [5]) -def test_quantization_models(model: str, max_tokens: int) -> None: - prompt = "The following numbers of the sequence " + ", ".join( - str(i) for i in range(1024)) + " are:" - example_prompts = [prompt] - - # NOTE: Using quantized model repo id from modelscope encounters an issue, - # this pr (https://github.com/vllm-project/vllm/pull/19212) fix the issue, - # after it is being merged, there's no need to download model explicitly. - model_path = snapshot_download(model) - - with VllmRunner(model_path, - max_model_len=8192, - enforce_eager=True, - dtype="auto", - gpu_memory_utilization=0.7, - quantization="ascend") as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - @pytest.mark.parametrize("model", MULTIMODALITY_VL_MODELS) def test_multimodal_vl(model, prompt_template, vllm_runner): image = ImageAsset("cherry_blossom") \ From d33cf650c719cfdf0e62ebbf9b5b800873c5a6fb Mon Sep 17 00:00:00 2001 From: TaoYu Chen Date: Tue, 29 Jul 2025 19:07:17 +0800 Subject: [PATCH 13/56] bump default python version to 3.11 (#2072) ### What this PR does / why we need it? Bump default python version to 3.11, see #1980 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? 
pass CI - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/12a223ef9bfebcc61e477047dce049495fe8c8a8 Signed-off-by: ChenTaoyu-SJTU Signed-off-by: weijinqian_v1 --- .github/Dockerfile.buildwheel | 2 +- .github/workflows/pre-commit.yml | 2 +- .github/workflows/release_code.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/Dockerfile.buildwheel b/.github/Dockerfile.buildwheel index 285f6c5a7f5..da8628677a5 100644 --- a/.github/Dockerfile.buildwheel +++ b/.github/Dockerfile.buildwheel @@ -14,7 +14,7 @@ # limitations under the License. # This file is a part of the vllm-ascend project. # -ARG PY_VERSION=3.10 +ARG PY_VERSION=3.11 FROM quay.io/ascend/manylinux:8.0.0-910b-manylinux_2_28-py${PY_VERSION} ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 25b802a65e0..e41dd6e634e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -14,7 +14,7 @@ jobs: uses: actions/checkout@v4 - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: - python-version: "3.10" + python-version: "3.11" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - name: Checkout vllm-project/vllm repo diff --git a/.github/workflows/release_code.yml b/.github/workflows/release_code.yml index a11312c1a7a..f64fcd0fc19 100644 --- a/.github/workflows/release_code.yml +++ b/.github/workflows/release_code.yml @@ -41,7 +41,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10"] + python-version: ["3.11"] steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 From 9ded27a7fed25b11e36ccf2d608f44fa9c6fb9ac Mon Sep 17 00:00:00 2001 From: taoxudonghaha Date: Tue, 29 Jul 2025 19:27:50 +0800 Subject: [PATCH 14/56] Add Custom Kernels For LoRA Performance (#1884) ### What this PR does / why we need it? Add two custom kernels (bgmv_shrink and bgmv_expand) to improve LoRA performance. ### Does this PR introduce _any_ user-facing change? No user-facing change. ### How was this patch tested? We added unit tests for the custom AscendC kernels; see vllm-ascend/tests/e2e/singlecard/ops/test_bgmv_shrink.py and vllm-ascend/tests/e2e/singlecard/ops/test_bgmv_expand.py. Based on an actual test of the Qwen2.5 7B model with vllm-ascend v0.9.2.rc1, TTFT, TPOT, and throughput improved by about 70%.
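Conceptually, the pair implements the usual LoRA shrink/expand split, y[:, off:off+size] += ((x @ lora_A_l^T) * s) @ lora_B_l^T, with the adapter index l looked up per token from indices. The sketch below chains the two new ops by hand, following the signatures registered in torch_binding.cpp and the shapes used by the unit tests added in this patch; the batch size, rank, and hidden sizes are arbitrary example values, and torch_npu is assumed to be available on the Ascend host.

    import torch
    import torch_npu  # noqa: F401  (assumed present in an Ascend environment)

    from vllm_ascend.utils import enable_custom_op

    enable_custom_op()  # registers torch.ops._C.bgmv_shrink / bgmv_expand

    bsz, rank, hidden_in, hidden_out, num_loras = 4, 16, 128, 128, 64
    x = torch.randn(bsz, hidden_in, dtype=torch.float16).npu()        # token activations
    lora_a = torch.randn(num_loras, rank, hidden_in, dtype=torch.float16).npu()
    lora_b = torch.randn(num_loras, hidden_out, rank, dtype=torch.float16).npu()
    indices = torch.zeros(bsz, dtype=torch.int64).npu()               # adapter id per token
    shrunk = torch.zeros(bsz, rank, dtype=torch.float32).npu()        # fp32 intermediate
    y = torch.zeros(bsz, hidden_out * 3, dtype=torch.float16).npu()   # full output buffer

    # shrunk = 0.5 * x @ lora_A_l^T  (bgmv_shrink overwrites `shrunk`)
    torch.ops._C.bgmv_shrink(x, lora_a, indices, shrunk, 0.5)
    # y[:, 0:hidden_out] += shrunk @ lora_B_l^T  (bgmv_expand adds into a slice of y)
    torch.ops._C.bgmv_expand(shrunk, lora_b, indices, y, 0, hidden_out)

The sgmv_* wrappers in the new lora_ops.py reduce to the same two calls by expanding the per-request LoRA indices with torch.repeat_interleave over the sequence lengths.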
- vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/40d86ee412eeeca93e0c37432db6b96829cb64e2 --------- Signed-off-by: taoxudonghaha Signed-off-by: weijinqian_v1 --- csrc/kernels/bgmv_expand.cpp | 369 ++++++++++++++++++ csrc/kernels/bgmv_shrink.cpp | 252 ++++++++++++ csrc/ops.h | 28 ++ csrc/torch_binding.cpp | 92 +++++ tests/e2e/singlecard/ops/test_bgmv_expand.py | 41 ++ tests/e2e/singlecard/ops/test_bgmv_shrink.py | 40 ++ vllm_ascend/lora/punica_wrapper/lora_ops.py | 112 ++++++ vllm_ascend/lora/punica_wrapper/punica_npu.py | 15 +- 8 files changed, 946 insertions(+), 3 deletions(-) create mode 100644 csrc/kernels/bgmv_expand.cpp create mode 100644 csrc/kernels/bgmv_shrink.cpp create mode 100644 tests/e2e/singlecard/ops/test_bgmv_expand.py create mode 100644 tests/e2e/singlecard/ops/test_bgmv_shrink.py create mode 100644 vllm_ascend/lora/punica_wrapper/lora_ops.py diff --git a/csrc/kernels/bgmv_expand.cpp b/csrc/kernels/bgmv_expand.cpp new file mode 100644 index 00000000000..84a4f094054 --- /dev/null +++ b/csrc/kernels/bgmv_expand.cpp @@ -0,0 +1,369 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_operator.h" +#include "types.h" + +template +class BGMVExpand { +public: + using X_T = float; + using W_T = scalar_t; + using Y_T = scalar_t; + + static constexpr uint64_t LORA_RANK_8 = 8; + static constexpr uint64_t LORA_RANK_16 = 16; + static constexpr uint64_t LORA_RANK_32 = 32; + static constexpr uint64_t LORA_RANK_64 = 64; + static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64}; + static constexpr int32_t BUFFER_NUM = 2; + + // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time. + static constexpr int32_t NUM_BYTES_PER_REPEAT = 256; + static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8; + // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type). + static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Mask is used to control the elements that participate in computation in each iteration. + static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants. + static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192; + static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096; + static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT; + // BlockReduceSum would generate(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT)floats. 
+ // So need to read them all and apply PairReduceSum + static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 = + (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + // The second PairReduceSum for rank=32, needs half of the repetition that happened for rank=16. + // Same for rank=64, we do not support ranks greater than 64. + static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2; + +public: + __aicore__ inline BGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {} + + __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* indices, + __gm__ void* yIn, __gm__ void* yOut, + uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) + { + batchSize_ = batchSize; + numTokensPerCore_ = numTokensPerCore; + maxLoRARank_ = maxLoRARank; + outputHiddenDim_ = outputHiddenDim; + sliceOffset_ = sliceOffset; + outputFullDim_ = outputFullDim; + singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_; + + xGm_.SetGlobalBuffer((__gm__ X_T *)x); + wGm_.SetGlobalBuffer((__gm__ W_T *)weight); + yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn); + yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut); + indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices); + + pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T)); + pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T)); + pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + + pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float)); + pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + + // Each compute iteration would generate not one, but several output elements. + // Therefore, the following variable would determine how many output elements are calculated in each iteration. 
+ numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_); + numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_; + + } + + __aicore__ inline void Process() + { + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t startIdx = blockIdx * numTokensPerCore_; + int64_t endIdx = startIdx + numTokensPerCore_; + if (endIdx > batchSize_) { + endIdx = batchSize_; + } + for (int64_t idx = startIdx; idx < endIdx; idx++) { + yOffset_ = outputFullDim_ * idx + sliceOffset_; + + // Set up LoRA index + CopyInIndex(idx); + if (reqLoRAIndex_ < 0) { + continue; + } + reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_; + + CopyInX(idx); + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + for (int32_t i = 0; i < numStreamOut; i++) { + CopyInY(i); + for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) { + CopyInW(i * numStreamInPerOutputTile_ + j); + Compute(j * numOutputElementsPerInputTile_); + } + ScaleOutput(); + CopyOut(i); + } + ComputeLastIteration(); + } + } + +private: + __aicore__ inline void CopyInIndex(const int64_t idx) + { + // Look up the LoRA index + reqLoRAIndex_ = indicesGm_.GetValue(idx); + } + + __aicore__ inline void ComputeLastIteration() + { + int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS; + if (remainingY == 0) { + return; + } + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + int32_t remainingW = remainingY * maxLoRARank_; + int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS; + int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS; + + CopyInY(numStreamOut, remainingY); + + int32_t outputIdx = 0; + for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) { + CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx); + Compute(outputIdx * numOutputElementsPerInputTile_); + } + + if (remainingWForLastRepeat != 0) { + CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration, + remainingWForLastRepeat); + int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat16 = + (lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2; + int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_; + Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32); + } + + ScaleOutput(remainingY); + CopyOut(numStreamOut, remainingY); + } + + __aicore__ inline void CopyInX(const int64_t idx) + { + AscendC::LocalTensor xLocal = inQueueX_.AllocTensor(); + if constexpr (std::is_same_v) { + DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_); + } else { + uint16_t blockLen = static_cast(maxLoRARank_ * sizeof(X_T)); + DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {}); + } + inQueueX_.EnQue(xLocal); + xLocal = inQueueX_.DeQue(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + + // As we are generating multiple output elements with one API invocation, + // we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT + if constexpr (std::is_same_v) { + for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xLocal.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } else { + Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_); + 
pipe_barrier(PIPE_V); + + for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xDup.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } + inQueueX_.FreeTensor(xLocal); + } + + __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yInLocal = inQueueY_.AllocTensor(); + DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements); + inQueueY_.EnQue(yInLocal); + } + + __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor wLocal = inQueueW_.AllocTensor(); + DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements); + inQueueW_.EnQue(wLocal); + } + + __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor yInLocal = inQueueY_.DeQue(); + AscendC::LocalTensor yInLocalFP32 = inBufferY_.Get(); + Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueY_.FreeTensor(yInLocal); + + Add(yLocal, yLocal, yInLocalFP32, numElements); + pipe_barrier(PIPE_V); + + AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); + Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements); + pipe_barrier(PIPE_V); + + outQueueY_.EnQue(yOutLocal); + } + + __aicore__ inline void Compute(int32_t progress, + int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS, + int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16, + int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + AscendC::LocalTensor wLocal = inQueueW_.DeQue(); + AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); + + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_); + pipe_barrier(PIPE_V); + inQueueW_.FreeTensor(wLocal); + + Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_); + pipe_barrier(PIPE_V); + + if (maxLoRARank_ == LORA_RANK_8) { + BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_16) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_32) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if 
(maxLoRARank_ == LORA_RANK_64) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } + } + + __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yOutLocal = outQueueY_.DeQue(); + DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements); + outQueueY_.FreeTensor(yOutLocal); + } + +private: + AscendC::TPipe* pipe_; + AscendC::TQue inQueueY_, inQueueW_; + AscendC::TQue inQueueX_; + AscendC::TQue outQueueY_; + AscendC::TBuf tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_; + AscendC::GlobalTensor xGm_; + AscendC::GlobalTensor wGm_; + AscendC::GlobalTensor yInGm_; + AscendC::GlobalTensor yOutGm_; + AscendC::GlobalTensor indicesGm_; + uint32_t batchSize_; + uint32_t numTokensPerCore_; + uint32_t maxLoRARank_; + uint32_t outputHiddenDim_; + uint32_t sliceOffset_; + uint32_t outputFullDim_; + uint32_t singleLoRAWeightLen_; + int64_t reqLoRAIndex_; + uint64_t reqLoRAWeightOffset_; + uint32_t numOutputElementsPerInputTile_; + uint32_t numStreamInPerOutputTile_; + uint64_t yOffset_; + + // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously. + // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat, + // reads next 8 consecutive blocks in the second repeat. + AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4}; + + // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block, + // so we set dstRepStride = 1 + AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8}; + + // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks. 
+ // For xDup we repeatedly use it, so we set src0RepStride = 0 + AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8}; + +}; + +#define BGMV_EXPAND_TYPE_DECLARE(TYPE) \ + extern "C" __global__ __aicore__ void bgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\ + __gm__ void* yIn, __gm__ void* yOut, \ + uint32_t batchSize, uint32_t numTokensPerCore, \ + uint32_t maxLoRARank, uint32_t outputHiddenDim, \ + uint32_t sliceOffset, uint32_t outputFullDim) \ + { \ + AscendC::TPipe pipe; \ + BGMVExpand op(&pipe); \ + op.Init(x, weight, indices, yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \ + outputHiddenDim, sliceOffset, outputFullDim); \ + op.Process(); \ + } + +// declare all dtype kernel +BGMV_EXPAND_TYPE_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + BGMV_EXPAND_TYPE_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { +extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, void* indices, + void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) +{ + uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore; + if (type == AscendType::FP16) { + bgmv_expand_half<<>>(x, weight, indices, yIn, yOut, batchSize, numTokensPerCore, + maxLoRARank, outputHiddenDim, sliceOffset, outputFullDim); + } else if (type == AscendType::BF16) { + #if (__CCE_AICORE__ >= 220) + bgmv_expand_bfloat16_t<<>>(x, weight, indices, yIn, yOut, batchSize, + numTokensPerCore, maxLoRARank, outputHiddenDim, + sliceOffset, outputFullDim); + #endif + } else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/kernels/bgmv_shrink.cpp b/csrc/kernels/bgmv_shrink.cpp new file mode 100644 index 00000000000..ae73eb73aac --- /dev/null +++ b/csrc/kernels/bgmv_shrink.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel_operator.h" +#include "types.h" + +template +class BGMVShrink { +public: + using X_T = scalar_t; + using W_T = scalar_t; + using Y_T = float; + + static constexpr uint64_t BUFFER_NUM = 1; + static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length + +public: + __aicore__ inline BGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {} + __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *indices, __gm__ void *y, + uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim, + uint32_t maxLoRARank, float scale) + { + batchSize_ = batchSize; + numTokensPerCore_ = numTokensPerCore; + inputHiddenDim_ = inputHiddenDim; + maxLoRARank_ = maxLoRARank; + scale_ = scale; + singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_; + incremental_ = inputHiddenDim_ > TILE_LENGTH; + + xGm_.SetGlobalBuffer((__gm__ X_T *)x); + yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y); + wGm_.SetGlobalBuffer((__gm__ W_T *)weight); + indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices); + + pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T)); + pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T)); + pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float)); + pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float)); + + pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T)); + pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float)); + } + + __aicore__ inline void Process() + { + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t startIdx = blockIdx * numTokensPerCore_; + int64_t endIdx = startIdx + numTokensPerCore_; + if (endIdx > batchSize_) { + endIdx = batchSize_; + } + for (int64_t idx = startIdx; idx < endIdx; idx++) { + // set up LoRA index + CopyInIndex(idx); + if (reqLoRAIndex_ < 0) { + continue; + } + reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_; + + if (incremental_) { + ProcessImpl(idx); + } else { + ProcessImpl(idx); + } + + ScaleOutput(); + CopyOut(idx); + } + } + +private: + template + __aicore__ inline void ProcessImpl(const int64_t idx) + { + AscendC::LocalTensor yOutLocal = outBufferY_.Get(); + if constexpr (!INCREMENTAL_MODE) { + CopyInX(idx, 0, inputHiddenDim_); + AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); + AscendC::LocalTensor xLocal = inQueueX_.DeQue(); + Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_); + pipe_barrier(PIPE_V); + inQueueX_.FreeTensor(xLocal); + } + + for (int i = 0; i < maxLoRARank_; i++) { + float acc(0); + for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) { + if constexpr (INCREMENTAL_MODE) { + CopyInX(idx, j); + } + CopyInW(i, j); + Compute(acc); + } + CopyAndComputeLastIteration(idx, i, acc); + yOutLocal.SetValue(i, acc); + } + } + + __aicore__ inline void CopyInIndex(const int64_t idx) + { + // look up the LoRA index + reqLoRAIndex_ = indicesGm_.GetValue(idx); + } + + __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor xLocal = inQueueX_.AllocTensor(); + DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements); + inQueueX_.EnQue(xLocal); + } + + __aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor wLocal = inQueueW_.AllocTensor(); + DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements); + inQueueW_.EnQue(wLocal); + } + + template + __aicore__ inline void Compute(float &acc, 
int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor wLocal = inQueueW_.DeQue(); + AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); + AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); + + if constexpr (INCREMENTAL_MODE) { + AscendC::LocalTensor xLocal = inQueueX_.DeQue(); + Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements); + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueX_.FreeTensor(xLocal); + inQueueW_.FreeTensor(wLocal); + } else { + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueW_.FreeTensor(wLocal); + } + // dot product of the one tile of X and W + Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements); + pipe_barrier(PIPE_V); + // reduce sum generate one number, which is the summation of all the dot product + ReduceSum(wTmpTensor, wTmpTensor, wTmpTensor, numElements); + pipe_barrier(PIPE_V); + + acc += wTmpTensor.GetValue(0); + } + + template + __aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc) + { + int32_t colIdx = inputHiddenDim_ / TILE_LENGTH; + int32_t remaining = inputHiddenDim_ % TILE_LENGTH; + if (remaining == 0) { + return; + } + if constexpr (INCREMENTAL_MODE) { + CopyInX(idx, colIdx, remaining); + } + CopyInW(rowIdx, colIdx, remaining); + Compute(acc, remaining); + } + + __aicore__ inline void ScaleOutput() + { + AscendC::LocalTensor yLocal = outBufferY_.Get(); + AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); + + Muls(yOutLocal, yLocal, scale_, maxLoRARank_); + pipe_barrier(PIPE_V); + + outQueueY_.EnQue(yOutLocal); + } + + __aicore__ inline void CopyOut(const int64_t idx) + { + AscendC::LocalTensor yOutLocal = outQueueY_.DeQue(); + DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_); + outQueueY_.FreeTensor(yOutLocal); + } + +private: + AscendC::TPipe *pipe_; + AscendC::TQue inQueueX_, inQueueW_; + AscendC::TQue outQueueY_; + AscendC::TBuf tmpBufferX_, tmpBufferW_, outBufferY_; + AscendC::GlobalTensor xGm_; + AscendC::GlobalTensor wGm_; + AscendC::GlobalTensor indicesGm_; + AscendC::GlobalTensor yOutGm_; + uint32_t batchSize_; + uint32_t numTokensPerCore_; + uint32_t inputHiddenDim_; + uint32_t maxLoRARank_; + float scale_; + uint32_t singleLoRAWeightLen_; + int64_t reqLoRAIndex_; + uint64_t reqLoRAWeightOffset_; + bool incremental_; +}; + +#define BGMV_SHRINK_TYPE_DECLARE(TYPE) \ + extern "C" __global__ __aicore__ void bgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\ + __gm__ void* y, uint32_t batchSize, \ + uint32_t numTokensPerCore, uint32_t inputHiddenDim, \ + uint32_t maxLoRARank, float scale) \ + { \ + AscendC::TPipe pipe; \ + BGMVShrink op(&pipe); \ + op.Init(x, weight, indices, y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \ + op.Process(); \ + } + +// declare all dtype kernel +BGMV_SHRINK_TYPE_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + BGMV_SHRINK_TYPE_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { +extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* indices, + void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim, + uint32_t maxLoRARank, float scale) +{ + uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore; + if (type == AscendType::FP16) { + bgmv_shrink_half<<>>(x, weight, indices, y, batchSize, numTokensPerCore, + inputHiddenDim, maxLoRARank, scale); + } else if (type == AscendType::BF16) { + #if 
(__CCE_AICORE__ >= 220) + bgmv_shrink_bfloat16_t<<>>(x, weight, indices, y, batchSize, numTokensPerCore, + inputHiddenDim, maxLoRARank, scale); + #endif + } else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h index 79dff893dd2..fff69bcc3aa 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -60,4 +60,32 @@ namespace vllm_ascend { auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options); return new_tensor; } + + extern void bgmv_shrink_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *indices, + void *y, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t input_hidden_dim, + uint32_t lora_rank, + float scale); + + extern void bgmv_expand_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *indices, + void *y, + void *y_out, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t lora_rank, + uint32_t output_hidden_dim, + uint32_t slice_offset, + uint32_t output_full_dim); } diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp index 7affe839989..f2a0d1f5de6 100644 --- a/csrc/torch_binding.cpp +++ b/csrc/torch_binding.cpp @@ -199,6 +199,90 @@ std::tuple get_masked_input_and_mask( cmd.Run(); return {masked_input, mask}; } + +void bgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y, double scale) +{ + at::ScalarType scalar_type = x.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(indices.dim() == 1, "indices should be [batch_size]"); + TORCH_CHECK(x.size(0) == y.size(0) && x.size(0) == indices.size(0), + "the first dimension of x, y, indices should be same"); + TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out"); + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* indices_ptr = indices.data_ptr(); + void* y_ptr = y.data_ptr(); + int batch_size = x.size(0); + int input_hidden_token = x.size(1); + uint32_t lora_rank = y.size(1); + float scale_f = static_cast(scale); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("bgmv_shrink"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, indices_ptr, y_ptr, batch_size, input_hidden_token, + lora_rank, scale_f]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + bgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, indices_ptr, y_ptr, batch_size, num_tokens_per_core, + input_hidden_token, lora_rank, scale_f); + return 0; + }); + cmd.Run(); + return; +} + +at::Tensor bgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y, + int64_t slice_offset, int64_t slice_size) +{ + at::ScalarType scalar_type = y.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + 
TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(indices.dim() == 1, "indices should be [batch_size]"); + TORCH_CHECK(x.size(0) == y.size(0) && x.size(0) == indices.size(0), + "the first dimension of x, y, indices should be same"); + TORCH_CHECK(x.size(1) <= slice_size, "hidden in should be smaller than hidden out"); + TORCH_CHECK(slice_offset >= 0, "slice offset should be no smaller than 0"); + TORCH_CHECK((slice_size + slice_offset) <= y.size(1), + "slice_size + slice_offset should be smaller than the second dimension of y") + + at::Tensor y_out = y; + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* indices_ptr = indices.data_ptr(); + void* y_ptr = y.data_ptr(); + void* y_out_ptr = y_out.data_ptr(); + int batch_size = x.size(0); + int lora_rank = x.size(1); + int output_full_dim = y.size(1); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("bgmv_expand"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, indices_ptr, y_ptr, y_out_ptr, batch_size, lora_rank, + slice_offset, slice_size, output_full_dim]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + bgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, indices_ptr, y_ptr, y_out_ptr, batch_size, + num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim); + return 0; + }); + cmd.Run(); + return y_out; +} } // namespace vllm_ascend TORCH_LIBRARY_EXPAND(_C, ops) @@ -223,6 +307,14 @@ TORCH_LIBRARY_EXPAND(_C, ops) " int added_vocab_start_index, " " int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)"); ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask); + + ops.def("bgmv_shrink(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y, float scale) -> ()"); + ops.impl("bgmv_shrink", torch::kPrivateUse1, &vllm_ascend::bgmv_shrink); + + ops.def( + "bgmv_expand(Tensor! x, Tensor! weight, Tensor! indices, Tensor! 
y," + " int slice_offset, int slice_size) -> Tensor"); + ops.impl("bgmv_expand", torch::kPrivateUse1, &vllm_ascend::bgmv_expand); } REGISTER_EXTENSION(_C) diff --git a/tests/e2e/singlecard/ops/test_bgmv_expand.py b/tests/e2e/singlecard/ops/test_bgmv_expand.py new file mode 100644 index 00000000000..5a6b187f1ed --- /dev/null +++ b/tests/e2e/singlecard/ops/test_bgmv_expand.py @@ -0,0 +1,41 @@ +import torch + +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() + +DEFAULT_ATOL = 1e-3 +DEFAULT_RTOL = 1e-3 + + +def bgmv_expand_cpu_impl(x: torch.Tensor, w: torch.Tensor, + indices: torch.Tensor, y: torch.tensor, + slice_offset: int, slice_size: int) -> torch.Tensor: + W = w[indices, :, :].transpose(-1, -2).to(torch.float32) + z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze() + y[:, slice_offset:slice_offset + slice_size] += z + return y + + +@torch.inference_mode() +def test_bgmv_expand() -> None: + B = 1 + x = torch.randn([B, 16], dtype=torch.float) + w = torch.randn([64, 128, 16], dtype=torch.float16) + indices = torch.zeros([B], dtype=torch.int64) + y = torch.randn([B, 128 * 3], dtype=torch.float16) + + x_npu = x.npu() + w_npu = w.npu() + indices_npu = indices.npu() + y_npu = y.npu() + + y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128) + y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0, + 128) + + # Compare the results. + torch.testing.assert_close(y_out_npu.cpu(), + y_out, + atol=DEFAULT_ATOL, + rtol=DEFAULT_RTOL) diff --git a/tests/e2e/singlecard/ops/test_bgmv_shrink.py b/tests/e2e/singlecard/ops/test_bgmv_shrink.py new file mode 100644 index 00000000000..6888b6eba88 --- /dev/null +++ b/tests/e2e/singlecard/ops/test_bgmv_shrink.py @@ -0,0 +1,40 @@ +import torch + +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() + +DEFAULT_ATOL = 1e-3 +DEFAULT_RTOL = 1e-3 + + +def bgmv_shrink_cpu_impl(x: torch.Tensor, w: torch.Tensor, + indices: torch.Tensor, y: torch.tensor, + scaling: float) -> torch.Tensor: + W = w[indices, :, :].transpose(-1, -2).to(torch.float32) + z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze() + y[:, :] += z * scaling + return y + + +@torch.inference_mode() +def test_bgmv_shrink() -> None: + B = 1 + x = torch.randn([B, 128], dtype=torch.float16) + w = torch.randn([64, 16, 128], dtype=torch.float16) + indices = torch.zeros([B], dtype=torch.int64) + y = torch.zeros([B, 16]) + + x_npu = x.npu() + w_npu = w.npu() + indices_npu = indices.npu() + y_npu = y.npu() + + y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5) + torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5) + + # Compare the results. + torch.testing.assert_close(y_npu.cpu(), + y, + atol=DEFAULT_ATOL, + rtol=DEFAULT_RTOL) diff --git a/vllm_ascend/lora/punica_wrapper/lora_ops.py b/vllm_ascend/lora/punica_wrapper/lora_ops.py new file mode 100644 index 00000000000..dd66937aae2 --- /dev/null +++ b/vllm_ascend/lora/punica_wrapper/lora_ops.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +def bgmv_shrink(inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0): + return torch.ops._C.bgmv_shrink( + inputs, + lora_a_weights, + lora_indices_tensor, + output_tensor, + scaling, + ) + + +def bgmv_expand(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True): + return torch.ops._C.bgmv_expand( + inputs, + lora_b_weights, + lora_indices_tensor, + output_tensor, + 0, + output_tensor.size(1), + ) + + +def bgmv_expand_slice(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True): + return torch.ops._C.bgmv_expand(inputs, lora_b_weights, + lora_indices_tensor, output_tensor, + slice_offset, slice_size) + + +def sgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + scaling: float, +): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, + seq_len_tensor) + + bgmv_shrink(inputs, lora_a_weights, output_tensor, exploded_indices, + scaling) + + +def sgmv_expand(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + add_inputs: bool = False): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, + seq_len_tensor) + + bgmv_expand(inputs, lora_b_weights, output_tensor, exploded_indices, + add_inputs) + + +def sgmv_expand_slice(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, + seq_len_tensor) + + bgmv_expand_slice(inputs, lora_b_weights, output_tensor, exploded_indices, + slice_offset, slice_size, add_inputs) diff --git a/vllm_ascend/lora/punica_wrapper/punica_npu.py b/vllm_ascend/lora/punica_wrapper/punica_npu.py index 339ed364d5b..9ca747b2d9a 100644 --- a/vllm_ascend/lora/punica_wrapper/punica_npu.py +++ b/vllm_ascend/lora/punica_wrapper/punica_npu.py @@ -3,9 +3,18 @@ from typing import Callable, Optional, Tuple, Union import torch -from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, - bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) + +from vllm_ascend.utils import is_310p + +if is_310p(): + from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +else: + from vllm_ascend.lora.punica_wrapper.lora_ops import ( + bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) + from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase From 1bcfe57b1e9385e14fc67c935cf03928c21a216d Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Tue, 29 Jul 2025 19:36:34 +0800 
Subject: [PATCH 15/56] [Doc] Add performance tuning doc to main (#1392) ### What this PR does / why we need it? Add performance tuning doc to main. Closes: https://github.com/vllm-project/vllm-ascend/issues/1387 - vLLM version: v0.9.1 - vLLM main: https://github.com/vllm-project/vllm/commit/923147b5e8551887fd64a0fc242c361d5216e1d7 --------- Signed-off-by: shen-shanshan <467638484@qq.com> Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com> Signed-off-by: weijinqian_v1 --- .../developer_guide/performance/index.md | 1 + .../performance/optimization_and_tuning.md | 183 ++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 docs/source/developer_guide/performance/optimization_and_tuning.md diff --git a/docs/source/developer_guide/performance/index.md b/docs/source/developer_guide/performance/index.md index 7f0f9f120ea..0fa14667aa1 100644 --- a/docs/source/developer_guide/performance/index.md +++ b/docs/source/developer_guide/performance/index.md @@ -5,4 +5,5 @@ :maxdepth: 1 performance_benchmark profile_execute_duration +optimization_and_tuning ::: diff --git a/docs/source/developer_guide/performance/optimization_and_tuning.md b/docs/source/developer_guide/performance/optimization_and_tuning.md new file mode 100644 index 00000000000..61e761abed1 --- /dev/null +++ b/docs/source/developer_guide/performance/optimization_and_tuning.md @@ -0,0 +1,183 @@ +# Optimization and Tuning + +This guide aims to help users to improve vllm-ascend performance on system level. It includes OS configuration, library optimization, deploy guide and so on. Any feedback is welcome. + +## Preparation + +Run the container: + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci0 +# Update the cann base image +export IMAGE=m.daocloud.io/quay.io/ascend/cann:|cann_image_tag| +docker run --rm \ +--name performance-test \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-it $IMAGE bash +``` + +Configure your environment: + +```{code-block} bash + :substitutions: +# Configure the mirror +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \ +echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb-src 
https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list + +# Install os packages +apt update && apt install wget gcc g++ libnuma-dev git vim -y +``` + +Install vllm and vllm-ascend: + +```{code-block} bash + :substitutions: +# Install necessary dependencies +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip install modelscope pandas datasets gevent sacrebleu rouge_score pybind11 pytest + +# Configure this var to speed up model download +VLLM_USE_MODELSCOPE=true +``` + +Please follow the [Installation Guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) to make sure vllm, vllm-ascend and mindie-turbo is installed correctly. + +:::{note} +Make sure your vllm and vllm-ascend are installed after your python configuration completed, because these packages will build binary files using the python in current environment. If you install vllm, vllm-ascend and mindie-turbo before chapter 1.1, the binary files will not use the optimized python. +::: + +## Optimizations + +### 1. Compilation Optimization + +#### 1.1. Install optimized `python` + +Python supports **LTO** and **PGO** optimization starting from version `3.6` and above, which can be enabled at compile time. And we have offered compilation optimized `python` packages directly to users for the sake of convenience. You can also reproduce the `python` build follow this [tutorial](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0063.html) according to your specific scenarios. + +```{code-block} bash + :substitutions: +mkdir -p /workspace/tmp +cd /workspace/tmp + +# Download prebuilt lib and packages +wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libcrypto.so.1.1 +wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libomp.so +wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libssl.so.1.1 +wget https://repo.oepkgs.net/ascend/pytorch/vllm/python/py311_bisheng.tar.gz + +# Configure python and pip +cp ./*.so* /usr/local/lib +tar -zxvf ./py311_bisheng.* -C /usr/local/ +mv /usr/local/py311_bisheng/ /usr/local/python +sed -i "1c#\!/usr/local/python/bin/python3.11" /usr/local/python/bin/pip3 +sed -i "1c#\!/usr/local/python/bin/python3.11" /usr/local/python/bin/pip3.11 +ln -sf /usr/local/python/bin/python3 /usr/bin/python +ln -sf /usr/local/python/bin/python3 /usr/bin/python3 +ln -sf /usr/local/python/bin/python3.11 /usr/bin/python3.11 +ln -sf /usr/local/python/bin/pip3 /usr/bin/pip3 +ln -sf /usr/local/python/bin/pip3 /usr/bin/pip + +export PATH=/usr/bin:/usr/local/python/bin:$PATH +``` + +### 2. OS Optimization + +#### 2.1. jemalloc + +**jemalloc** is a memory allocator that improves performance for multi-threads scenario and can reduce memory fragment. jemalloc use thread local memory manager to allocate variables, which can avoid lock competition between multi-threads and can hugely optimize performance. + +```{code-block} bash + :substitutions: +# Install jemalloc +sudo apt update +sudo apt install libjemalloc2 + +# Configure jemalloc +export LD_PRELOAD=/usr/lib/"$(uname -i)"-linux-gnu/libjemalloc.so.2 $LD_PRELOAD +``` + +#### 2.2. Tcmalloc + +**Tcmalloc (Thread Counting Malloc)** is a universal memory allocator that improves overall performance while ensuring low latency by introducing a multi-level cache structure, reducing mutex competition and optimizing large object processing flow. 
Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/700/ptmoddevg/trainingmigrguide/performance_tuning_0068.html). + +```{code-block} bash + :substitutions: +# Install tcmalloc +sudo apt update +sudo apt install libgoogle-perftools4 libgoogle-perftools-dev + +# Get the location of libtcmalloc.so* +find /usr -name libtcmalloc.so* + +# Make the priority of tcmalloc higher +# The is the location of libtcmalloc.so we get from the upper command +# Example: "$LD_PRELOAD:/usr/lib/aarch64-linux-gnu/libtcmalloc.so" +export LD_PRELOAD="$LD_PRELOAD:" + +# Verify your configuration +# The path of libtcmalloc.so will be contained in the result if your configuration is valid +ldd `which python` +``` + +### 3. `torch_npu` Optimization + +Some performance tuning features in `torch_npu` are controlled by environment variables. Some features and their related environment variables are shown below. + +Memory optimization: + +```{code-block} bash + :substitutions: +# Upper limit of memory block splitting allowed (MB), Setting this parameter can prevent large memory blocks from being split. +export PYTORCH_NPU_ALLOC_CONF="max_split_size_mb:250" + +# When operators on the communication stream have dependencies, they all need to be ended before being released for reuse. The logic of multi-stream reuse is to release the memory on the communication stream in advance so that the computing stream can be reused. +export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" +``` + +Schedule optimization: + +```{code-block} bash + :substitutions: +# Optimize operator delivery queue, this will affect the memory peak value, and may degrade if the memory is tight. +export TASK_QUEUE_ENABLE=2 + +# This will greatly improve the CPU bottleneck model and ensure the same performance for the NPU bottleneck model. +export CPU_AFFINITY_CONF=1 +``` + +### 4. CANN Optimization + +#### 4.1. HCCL Optimization + +There are some performance tuning features in HCCL, which are controlled by environment variables. + +You can configure HCCL to use "AIV" mode to optimize performance by setting the environment variable shown below. In "AIV" mode, the communication is scheduled by AI vector core directly with ROCE, instead of being scheduled by AI cpu. + +```{code-block} bash + :substitutions: +export HCCL_OP_EXPANSION_MODE="AIV" +``` + +Plus, there are more features for performance optimization in specific scenarios, which are shown below. + +- `HCCL_INTRA_ROCE_ENABLE`: Use RDMA link instead of SDMA link between two 8Ps as the mesh interconnect link, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0044.html). +- `HCCL_RDMA_TC`: Use this var to configure traffic class of RDMA network card, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0045.html). +- `HCCL_RDMA_SL`: Use this var to configure service level of RDMA network card, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0046.html). +- `HCCL_BUFFSIZE`: Use this var to control the cache size for sharing data between two NPUs, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0047.html). 
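+
+For reference, a combined HCCL configuration could look like the sketch below. The concrete values (for example `HCCL_BUFFSIZE=512` and `HCCL_RDMA_TC=132`) are illustrative assumptions only, not recommended defaults, and should be tuned for your own cluster and RDMA network.
+
+```{code-block} bash
+ :substitutions:
+# Schedule communication on AI vector cores directly
+export HCCL_OP_EXPANSION_MODE="AIV"
+
+# Illustrative values only; adjust them to your cluster topology and NIC settings
+export HCCL_INTRA_ROCE_ENABLE=1
+export HCCL_RDMA_TC=132
+export HCCL_RDMA_SL=4
+export HCCL_BUFFSIZE=512
+```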
From 18adb9d7d2968ae0b74e0d1cc78565205c2df7b2 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 29 Jul 2025 19:38:30 +0800 Subject: [PATCH 16/56] [e2e]Fixed the issue that pyhccl e2e cannot run continuously with other tests (#1246) ### What this PR does / why we need it? 1.Fixed the issue that pyhccl e2e cannot run continuously with other tests. 2.Cleaned up the resources occupied by the dynamic_npugraph_batchsize e2e test. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? This is a e2e test e2e multi-cards tests local running successfully. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/0df4d9b06b15fa39eeb2d440e7742da93afd5e6c Signed-off-by: leo-pony Signed-off-by: weijinqian_v1 --- .../test_dynamic_npugraph_batchsize.py | 36 +++++------ .../e2e/multicard/test_pyhccl_distributed.py | 59 +++++++++++-------- 2 files changed, 54 insertions(+), 41 deletions(-) diff --git a/tests/e2e/multicard/test_dynamic_npugraph_batchsize.py b/tests/e2e/multicard/test_dynamic_npugraph_batchsize.py index e5c7042b1ec..8d0ad4911e2 100644 --- a/tests/e2e/multicard/test_dynamic_npugraph_batchsize.py +++ b/tests/e2e/multicard/test_dynamic_npugraph_batchsize.py @@ -16,7 +16,9 @@ # import pytest import torch -from vllm import LLM, SamplingParams +from vllm import SamplingParams + +from tests.e2e.conftest import VllmRunner MODELS = [ "Qwen/Qwen2.5-0.5B-Instruct", @@ -38,20 +40,20 @@ def test_models(model: str, tp_size: int, max_tokens: int, temperature: int, ignore_eos: bool) -> None: # Create an LLM. - llm = LLM( - model=model, - tensor_parallel_size=tp_size, - ) - # Prepare sampling_parames - sampling_params = SamplingParams( - max_tokens=max_tokens, - temperature=temperature, - ignore_eos=ignore_eos, - ) + with VllmRunner( + model_name=model, + tensor_parallel_size=tp_size, + ) as vllm_model: + # Prepare sampling_parames + sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=temperature, + ignore_eos=ignore_eos, + ) - # Generate texts from the prompts. - # The output is a list of RequestOutput objects - outputs = llm.generate(prompts, sampling_params) - torch.npu.synchronize() - # The output length should be equal to prompts length. - assert len(outputs) == len(prompts) + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + outputs = vllm_model.generate(prompts, sampling_params) + torch.npu.synchronize() + # The output length should be equal to prompts length. 
+ assert len(outputs) == len(prompts) diff --git a/tests/e2e/multicard/test_pyhccl_distributed.py b/tests/e2e/multicard/test_pyhccl_distributed.py index 42918c92334..e3d9aedf156 100644 --- a/tests/e2e/multicard/test_pyhccl_distributed.py +++ b/tests/e2e/multicard/test_pyhccl_distributed.py @@ -24,9 +24,39 @@ init_distributed_environment) from vllm.utils import update_environment_variables +from tests.e2e.conftest import cleanup_dist_env_and_memory from vllm_ascend.distributed.device_communicators.pyhccl import \ PyHcclCommunicator +os.environ["TOKENIZERS_PARALLELISM"] = "true" + +multiprocessing.set_start_method("spawn", force=True) + + +def _worker_entry(env, fn): + # `multiprocessing.Process` cannot accept environment variables directly + # so we need to pass the environment variables as arguments + # and update the environment variables in the function + update_environment_variables(env) + + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + word_size = int(os.environ['WORLD_SIZE']) + + distributed_init_method = "tcp://localhost:12345" + + device = torch.device(f"npu:{local_rank}") + torch.npu.set_device(device) + + init_distributed_environment( + world_size=word_size, + rank=rank, + distributed_init_method=distributed_init_method, + local_rank=local_rank, + backend="hccl") + fn() + cleanup_dist_env_and_memory() + def distributed_run(fn, world_size): number_of_processes = world_size @@ -37,9 +67,7 @@ def distributed_run(fn, world_size): env['LOCAL_RANK'] = str(i) env['WORLD_SIZE'] = str(number_of_processes) env['LOCAL_WORLD_SIZE'] = str(number_of_processes) - env['MASTER_ADDR'] = 'localhost' - env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=fn, args=(env, )) + p = multiprocessing.Process(target=_worker_entry, args=(env, fn)) processes.append(p) p.start() @@ -50,22 +78,6 @@ def distributed_run(fn, world_size): assert p.exitcode == 0 -def worker_fn_wrapper(fn): - # `multiprocessing.Process` cannot accept environment variables directly - # so we need to pass the environment variables as arguments - # and update the environment variables in the function - def wrapped_fn(env): - update_environment_variables(env) - local_rank = os.environ['LOCAL_RANK'] - device = torch.device(f"npu:{local_rank}") - torch.npu.set_device(device) - init_distributed_environment(backend="hccl") - fn() - - return wrapped_fn - - -@worker_fn_wrapper def worker_fn(): pynccl_comm = PyHcclCommunicator(get_world_group().cpu_group, device=get_world_group().device) @@ -76,11 +88,10 @@ def worker_fn(): assert torch.all(tensor == pynccl_comm.world_size).cpu().item() -# def test_pyhccl(): -# distributed_run(worker_fn, 2) +def test_pyhccl(): + distributed_run(worker_fn, 4) -@worker_fn_wrapper def broadcast_worker_fn(): # Test broadcast for every root rank. # Essentially this is an all-gather operation. @@ -106,5 +117,5 @@ def broadcast_worker_fn(): assert torch.all(recv_tensors[i] == i).cpu().item() -# def test_pyhccl_broadcast(): -# distributed_run(broadcast_worker_fn, 4) +def test_pyhccl_broadcast(): + distributed_run(broadcast_worker_fn, 4) From adb37d01baddd1adb3376337183ed21027a02121 Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Tue, 29 Jul 2025 23:53:19 +0800 Subject: [PATCH 17/56] [Perf][MoE] Improve MoE multistream parallel performace. (#1891) This PR designs the shared expert multi-stream parallelism of w8a8-dynamic-quantized MoE stage in more detail to achieve better performance. 
- vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2cc571199b1446f376ee019fcafda19155fc6b71 Signed-off-by: whx-sjtu <2952154980@qq.com> Signed-off-by: weijinqian_v1 --- vllm_ascend/models/deepseek_v2.py | 2 +- vllm_ascend/ops/fused_moe.py | 17 ++++ vllm_ascend/quantization/w8a8_dynamic.py | 119 ++++++++++++++++++++--- 3 files changed, 124 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 129e5eb3a85..888697219c7 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -393,7 +393,7 @@ def forward(self, # router_logits: (num_tokens, n_experts) router_logits = None - if not self.rm_router_logits: + if not self.rm_router_logits and not self.enable_multistream_moe: router_logits, _ = self.gate(hidden_states) experts_hidden_states = self.experts( diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index fbe187235f3..cc9e47afc4c 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1384,6 +1384,21 @@ def forward(self, forward_context = get_forward_context() fused_moe_state = forward_context.fused_moe_state mc2_mask = forward_context.mc2_mask + # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel. + quantized_x_for_share, dynamic_scale_for_share = None, None + from vllm_ascend.quantization.w8a8_dynamic import \ + AscendW8A8DynamicFusedMoEMethod + if self.enable_multistream_moe: + if not self.rm_router_logits: + router_logits, _ = gate(hidden_states) + if hasattr(self.quant_method, "quant_method") and \ + isinstance(self.quant_method.quant_method, + AscendW8A8DynamicFusedMoEMethod + ) and fused_moe_state == FusedMoEState.MC2: + with npu_stream_switch("moe_secondary", 0): + quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant( + hidden_states) + if shared_experts: if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2: # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce @@ -1469,6 +1484,8 @@ def forward(self, shared_experts=shared_experts if self.torchair_graph_enabled and self.enable_multistream_moe and not is_prefill else None, mc2_mask=mc2_mask, + quantized_x_for_share=quantized_x_for_share, + dynamic_scale_for_share=dynamic_scale_for_share, token_dispatcher=self.token_dispatcher, ) diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index f1667d04a57..36549e75860 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -33,6 +33,82 @@ dispose_tensor, get_ascend_soc_version) +def apply_mlp_decode(hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + group_list: torch.Tensor, + dynamic_scale: torch.Tensor = None, + group_list_type: int = 1) -> torch.Tensor: + """ + apply MLP: gate_up_proj -> swiglu -> down_proj + Args: + hidden_states_wrapper: wrapper of input hidden states with shape (num_tokens, hidden_size). 
+ w1: expert weights1 with shape + (num_experts, hidden_size, intermediate_size * 2) + w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2) + w2: expert weights2 with shape + (num_experts, intermediate_size, hidden_size) + w2_scale: weights2 scale with shape (num_experts, hidden_size) + group_list: number of tokens for each expert, follow cumsum mode, and + with shape (num_experts). + transpose_weight: + w1: (num_experts, intermediate_size * 2, hidden_size) -> + (num_experts, hidden_size, intermediate_size * 2) + w2: (num_experts, hidden_size, intermediate_size) -> + (num_experts, intermediate_size, hidden_size) + Returns: + hidden_states: output hidden states after MLP. + """ + + if dynamic_scale is None: + unquantized_hidden_states = hidden_states + hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant( + hidden_states) + # Dispose the original unquantized hidden states + # to save npu memory because they're no longer used. + dispose_tensor(unquantized_hidden_states) + else: + pertoken_scale = dynamic_scale + + # gmm1: gate_up_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + split_item=3, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=torch.int32)[0] + + # act_fn: swiglu + hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant( + x=hidden_states, + weight_scale=w1_scale, + activation_scale=pertoken_scale, + bias=None, + quant_scale=None, + quant_offset=None, + group_index=group_list, + activate_left=True, + quant_mode=1, + ) + + # gmm2: down_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + scale=[w2_scale], + per_token_scale=[swiglu_out_scale], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=w2_scale.dtype)[0] + return hidden_states + + def apply_mlp(hidden_states: torch.Tensor, w1: torch.Tensor, w1_scale: torch.Tensor, @@ -124,6 +200,8 @@ def fused_experts_with_mc2( quantized_x_for_share: Optional[Any] = None, dynamic_scale_for_share: Optional[Any] = None, mc2_mask: Optional[torch.Tensor] = None, + shared_gate_up: Optional[Any] = None, + shared_dequant_scale: Optional[Any] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: assert mc2_mask is not None if log2phy is not None: @@ -186,18 +264,19 @@ def fused_experts_with_mc2( if shared_experts is not None: with npu_stream_switch("moe_secondary", 0): - npu_wait_tensor(quantized_x_for_share, expand_x) + npu_wait_tensor(shared_gate_up, expand_x) shared_act_out = shared_experts.act_fn( - (quantized_x_for_share, dynamic_scale_for_share)) + (shared_gate_up, shared_dequant_scale)) shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1] - down_out_list = apply_mlp(expand_x, - w1, - w1_scale, - w2, - w2_scale, - expert_token_nums, - dynamic_scale=dynamic_scale) + # `expand_x` will be disposed in the `apply_mlp` function + down_out_list = apply_mlp_decode(expand_x, + w1, + w1_scale, + w2, + w2_scale, + expert_token_nums, + dynamic_scale=dynamic_scale) # moeCombine kwargs_mc2 = { @@ -745,6 +824,8 @@ def apply( log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, shared_experts: Optional[Any] = None, + quantized_x_for_share: Optional[Any] = None, + dynamic_scale_for_share: Optional[Any] = None, **kwargs, ) -> torch.Tensor: assert router_logits.shape[ @@ -781,6 +862,16 @@ def apply( e_score_correction_bias=e_score_correction_bias, ) + fused_moe_state = 
get_forward_context().fused_moe_state + shared_gate_up, shared_dequant_scale = None, None + if shared_experts is not None and fused_moe_state == FusedMoEState.MC2: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(quantized_x_for_share, router_logits) + share_up_out, _ = shared_experts.gate_up_proj( + (quantized_x_for_share, dynamic_scale_for_share)) + shared_gate_up, shared_dequant_scale = share_up_out[ + 0], share_up_out[1] + # this is a naive implementation for experts load balance so as # to avoid accumulating too much tokens on a single rank. # currently it is only activated when doing profile runs. @@ -788,8 +879,6 @@ def apply( topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) topk_weights = topk_weights.to(x.dtype) - - fused_moe_state = get_forward_context().fused_moe_state if fused_moe_state == FusedMoEState.AllGatherEP: return fused_experts_with_allgather( hidden_states=x, @@ -806,7 +895,7 @@ def apply( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - w1_scale=layer.w13_weight_scale, + w1_scale=layer.w13_weight_scale_fp32, w2_scale=layer.w2_weight_scale, topk_weights=topk_weights, topk_ids=topk_ids, @@ -817,7 +906,9 @@ def apply( global_redundant_expert_num=global_redundant_expert_num, shared_experts=shared_experts, is_torchair=self.torchair_graph_enabled, - mc2_mask=kwargs.get("mc2_mask", None)) + mc2_mask=kwargs.get("mc2_mask", None), + shared_gate_up=shared_gate_up, + shared_dequant_scale=shared_dequant_scale) elif fused_moe_state in [ FusedMoEState.AllGather, FusedMoEState.NaiveMulticast ]: @@ -860,6 +951,8 @@ def process_weights_after_loading(self, layer): torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ) layer.w13_weight_scale.data = layer.w13_weight_scale.data.view( layer.w13_weight_scale.data.shape[0], -1) + layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to( + torch.float32) layer.w13_weight_offset.data = layer.w13_weight_offset.data.view( layer.w13_weight_offset.data.shape[0], -1) layer.w2_weight_scale.data = layer.w2_weight_scale.data.view( From 8c28c2bff971951b5991ef45ca7a1585af202b90 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 30 Jul 2025 08:47:22 +0800 Subject: [PATCH 18/56] [Refactor]Refactor sampler (#2050) Refactor Sampler implementation from patch way to inherit from vLLM Sampler interface. 
Next step: Make the op `TopKTopPSampler` in vLLM support custom ops register mechanism - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/61a6905ab036fd00eafdb1b0ca130d5feccfe686 Signed-off-by: wangxiyuan Signed-off-by: weijinqian_v1 --- .../worker/patch_common/test_patch_sampler.py | 46 ---------- tests/ut/sample/test_sampler.py | 32 +++++++ vllm_ascend/envs.py | 6 +- vllm_ascend/patch/__init__.py | 16 +--- .../patch/worker/patch_common/__init__.py | 1 - .../worker/patch_common/patch_sampler.py | 83 ------------------- vllm_ascend/sample/sampler.py | 62 ++++++++++++++ vllm_ascend/worker/model_runner_v1.py | 12 ++- 8 files changed, 108 insertions(+), 150 deletions(-) delete mode 100644 tests/ut/patch/worker/patch_common/test_patch_sampler.py create mode 100644 tests/ut/sample/test_sampler.py delete mode 100644 vllm_ascend/patch/worker/patch_common/patch_sampler.py create mode 100644 vllm_ascend/sample/sampler.py diff --git a/tests/ut/patch/worker/patch_common/test_patch_sampler.py b/tests/ut/patch/worker/patch_common/test_patch_sampler.py deleted file mode 100644 index fc9fbd145ca..00000000000 --- a/tests/ut/patch/worker/patch_common/test_patch_sampler.py +++ /dev/null @@ -1,46 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. - -import importlib -import os -from unittest import mock - -import torch -from vllm.v1.sample.ops import topk_topp_sampler - -from tests.ut.base import TestBase - - -class TestTopKTopPSamplerOptimize(TestBase): - - @mock.patch.dict(os.environ, - {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"}) - @mock.patch("torch_npu.npu_top_k_top_p") - def test_npu_topk_topp_called_when_optimized(self, mock_npu_op): - # We have to patch and reload because the patch will take effect - # only after VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE is set. 
- import vllm_ascend.patch.worker.patch_common.patch_sampler - importlib.reload(vllm_ascend.patch.worker.patch_common.patch_sampler) - - mock_npu_op.return_value = (torch.randn(1, 3)) - sampler = topk_topp_sampler.TopKTopPSampler() - - logits = torch.tensor([[1.0, 2.0, 3.0]]) - k = torch.tensor([2]) - p = torch.tensor([0.9]) - generators = {0: torch.Generator()} - generators[0].manual_seed(42) - - sampler.forward_native(logits, generators, k, p) - mock_npu_op.assert_called_once_with(logits, p, k) diff --git a/tests/ut/sample/test_sampler.py b/tests/ut/sample/test_sampler.py new file mode 100644 index 00000000000..98a83e6f270 --- /dev/null +++ b/tests/ut/sample/test_sampler.py @@ -0,0 +1,32 @@ +from unittest import mock + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.sample.sampler import AscendSampler, AscendTopKTopPSampler + + +class TestAscendSampler(TestBase): + + def test_init_with_raw_logprobs(self): + sampler = AscendSampler(logprobs_mode="raw_logprobs") + self.assertEqual(sampler.logprobs_mode, "raw_logprobs") + self.assertTrue(hasattr(sampler, 'topk_topp_sampler')) + self.assertIsInstance(sampler.topk_topp_sampler, AscendTopKTopPSampler) + + +class TestAscendTopKTopPSampler(TestBase): + + @mock.patch("torch_npu.npu_top_k_top_p") + def test_npu_topk_topp_called_when_optimized(self, mock_npu_op): + mock_npu_op.return_value = (torch.randn(1, 3)) + sampler = AscendTopKTopPSampler() + + logits = torch.tensor([[1.0, 2.0, 3.0]]) + k = torch.tensor([2]) + p = torch.tensor([0.9]) + generators = {0: torch.Generator()} + generators[0].manual_seed(42) + + sampler.forward_native(logits, generators, k, p) + mock_npu_op.assert_called_once_with(logits, p, k) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 586b846c276..ed169c327f3 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -128,11 +128,11 @@ "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE": lambda: int( os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)), - # Whether to enable the topk optimization. It's disabled by default for experimental support - # We'll make it enabled by default in the future. + # Whether to enable the topk optimization. It's enabled by default. Please set to False if you hit any issue. + # We'll remove this flag in the future once it's stable enough. "VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": lambda: bool( - int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '0'))), + int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '1'))), # `LLMDataDistCMgrConnector` required variable. `DISAGGREGATED_PREFILL_RANK_TABLE_PATH` is # used for llmdatadist to build the communication topology for kv cache transfer, it is diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 3446c450139..f22d948f73a 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -88,21 +88,7 @@ # Future Plan: # Remove this patch once pytorch 2.7.0 is supported for vllm ascend. # -# ** File: worker/patch_common/patch_sampler.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.v1.sample.sampler.Sampler.apply_top_k_top_p` -# Why: -# We need to use the patched `apply_top_k_top_p` in `sample`. -# The mainly reason to overwrite `apply_top_k_top_p` is -# to improve performance. -# How: -# Re-implementation the `apply_top_k_top_p` function by pytorch -# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm-ascend/pull/1732 -# Future Plan: -# Revert it when the ascend scatter performance improves. 
-# -# ** File: worker/patch_common/patch_sampler.py ** +# ** File: worker/patch_0_10_0/patch_sampler_gather_logprobs.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs` # Why: diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index 8eebcdf4aee..2533d13e3d5 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -21,4 +21,3 @@ import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_linear # noqa import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa -import vllm_ascend.patch.worker.patch_common.patch_sampler # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_sampler.py b/vllm_ascend/patch/worker/patch_common/patch_sampler.py deleted file mode 100644 index e745bf04e78..00000000000 --- a/vllm_ascend/patch/worker/patch_common/patch_sampler.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import Optional - -import torch -import torch_npu -from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample - -from vllm_ascend import envs - - -def apply_top_k_top_p( - logits: torch.Tensor, - k: torch.Tensor, - p: torch.Tensor, -) -> torch.Tensor: - if p is not None and k is not None: - # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p) - return torch_npu.npu_top_k_top_p(logits, p, k) - - probs = logits.softmax(dim=-1) - probs_sort, _ = probs.sort(dim=-1, descending=False) - - if k is not None: - top_k_count = probs_sort.size(1) - k.to(torch.long) # shape: (batch, ) - top_k_count = top_k_count.unsqueeze(dim=1) - top_k_cutoff = probs_sort.gather(-1, top_k_count) - - # Make sure the no top-k rows are no-op. - no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1) - top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf")) - - elements_to_discard = probs < top_k_cutoff - logits.masked_fill_(elements_to_discard, -float("inf")) - - if p is not None: - cumprob = torch.cumsum(probs_sort, dim=-1) - top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1) - top_p_mask[:, -1] = False # at least one - - top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1) - top_p_cutoff = probs_sort.gather(-1, top_p_count) - elements_to_discard = probs < top_p_cutoff - logits.masked_fill_(elements_to_discard, -float("inf")) - - return logits - - -def topk_topp_forward_native( - self, - logits: torch.Tensor, - generators: dict[int, torch.Generator], - k: Optional[torch.Tensor], - p: Optional[torch.Tensor], -) -> torch.Tensor: - """ - PyTorch-native implementation of top-k and top-p sampling. - - The logits tensor may be updated in-place. 
- """ - logits = apply_top_k_top_p(logits, k, p) - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) - - -if envs.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION: - TopKTopPSampler.forward_native = topk_topp_forward_native diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py new file mode 100644 index 00000000000..862bd03e1b2 --- /dev/null +++ b/vllm_ascend/sample/sampler.py @@ -0,0 +1,62 @@ +import torch +import torch_npu +from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample +from vllm.v1.sample.sampler import Sampler + + +class AscendSampler(Sampler): + + def __init__(self, logprobs_mode="raw_logprobs"): + # TODO: support logprobs_mode in vllm-ascend + super().__init__(logprobs_mode=logprobs_mode) + self.topk_topp_sampler = AscendTopKTopPSampler() + + +class AscendTopKTopPSampler(TopKTopPSampler): + + def _apply_top_k_top_p( + self, + logits: torch.Tensor, + k: torch.Tensor, + p: torch.Tensor, + ) -> torch.Tensor: + if p is not None and k is not None: + # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p) + return torch_npu.npu_top_k_top_p(logits, p, k) + + if p is None and k is None: + return logits + + probs = logits.softmax(dim=-1) + probs_sort, _ = probs.sort(dim=-1, descending=False) + + if k is not None: + top_k_count = probs_sort.size(1) - k.to( + torch.long) # shape: (batch, ) + top_k_count = top_k_count.unsqueeze(dim=1) + top_k_cutoff = probs_sort.gather(-1, top_k_count) + + # Make sure the no top-k rows are no-op. + no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1) + top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf")) + + elements_to_discard = probs < top_k_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + if p is not None: + cumprob = torch.cumsum(probs_sort, dim=-1) + top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1) + top_p_mask[:, -1] = False # at least one + + top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1) + top_p_cutoff = probs_sort.gather(-1, top_p_count) + elements_to_discard = probs < top_p_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + return logits + + def forward_native(self, logits, generators, k, p): + """Override pytorch native implementation to torch_npu""" + logits = self._apply_top_k_top_p(logits, k, p) + probs = logits.softmax(dim=-1, dtype=torch.float32) + return random_sample(probs, generators) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 2bee8dd442d..886ccb89622 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -64,7 +64,6 @@ ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.sample.sampler import Sampler from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin @@ -72,6 +71,7 @@ sanity_check_mm_encoder_outputs, scatter_mm_placeholders) +from vllm_ascend import envs from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.attention.attention_mask import AttentionMaskBuilder @@ -165,7 +165,15 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_rank = vllm_config.parallel_config.data_parallel_rank self.device = device self.dtype = self.model_config.dtype - self.sampler = 
Sampler() + if envs.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION: + # TODO: drop the env config to use ascend sampler by default + from vllm_ascend.sample.sampler import AscendSampler + + self.sampler = AscendSampler() + else: + from vllm.v1.sample.sampler import Sampler + + self.sampler = Sampler() # Lazy initialization, these will be set after __init__ self.kv_caches: List[torch.Tensor] = [] From 55c2138e143d6dd0b7dc7b6cfb0761b26622e4c9 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 30 Jul 2025 09:08:00 +0800 Subject: [PATCH 19/56] [CI] Fix test on pyhccl to 2 cards (#2094) ### What this PR does / why we need it? Fix test on pyhccl to 2 cards ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/0d0cc9e15001b18997207fc86af6810500d587d9 Signed-off-by: MengqingCao Signed-off-by: weijinqian_v1 --- tests/e2e/multicard/test_pyhccl_distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/multicard/test_pyhccl_distributed.py b/tests/e2e/multicard/test_pyhccl_distributed.py index e3d9aedf156..2300e0a2258 100644 --- a/tests/e2e/multicard/test_pyhccl_distributed.py +++ b/tests/e2e/multicard/test_pyhccl_distributed.py @@ -89,7 +89,7 @@ def worker_fn(): def test_pyhccl(): - distributed_run(worker_fn, 4) + distributed_run(worker_fn, 2) def broadcast_worker_fn(): @@ -118,4 +118,4 @@ def broadcast_worker_fn(): def test_pyhccl_broadcast(): - distributed_run(broadcast_worker_fn, 4) + distributed_run(broadcast_worker_fn, 2) From 868aa2f7002ad4c989eaae5e7235772161f45cc5 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 09:34:05 +0800 Subject: [PATCH 20/56] [v0.9.1][Feature] add moe alltoallv. 
Signed-off-by: weijinqian_v1 --- vllm_ascend/attention/attention_v1.py | 13 + vllm_ascend/models/__init__.py | 10 +- vllm_ascend/models/qwen3_dbo.py | 552 ++++++++++++++++++++++++++ vllm_ascend/multistream/ms_split.py | 115 +++++- 4 files changed, 686 insertions(+), 4 deletions(-) create mode 100644 vllm_ascend/models/qwen3_dbo.py diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 668c802c400..01b51e15607 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -28,6 +28,7 @@ from vllm.utils import direct_register_custom_op from vllm.v1.core.sched.output import SchedulerOutput +from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, nd_to_nz_2d, nd_to_nz_spec) @@ -150,6 +151,18 @@ class AscendMetadata: # (num_tokens,) slot_mapping: torch.Tensor = None + def split_metadata_for_multistream( + self, + ms_split_config: MSAttentionMetadataSplitConfig, + ) -> list["AscendMetadata"]: + """Split metadata for multi-stream with AscendMetadata""" + from vllm_ascend.multistream.ms_split import model_input_split_v1_attn + return model_input_split_v1_attn( + ms_split_config=ms_split_config, + attn_metadata=self, + _metadata_cls=AscendMetadata, + ) + class AscendAttentionMetadataBuilder: diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index 0b1b67a4f19..e3609f802d7 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -41,6 +41,10 @@ def register_model(): "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM") + ModelRegistry.register_model( + "Qwen3MoeForCausalLM", + "vllm_ascend.models.qwen3_dbo:CustomQwen3MoeForCausalLMDBO") + else: ModelRegistry.register_model( "DeepseekV2ForCausalLM", @@ -50,9 +54,9 @@ def register_model(): "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM") - ModelRegistry.register_model( - "Qwen3MoeForCausalLM", - "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") + ModelRegistry.register_model( + "Qwen3MoeForCausalLM", + "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") ModelRegistry.register_model( "Qwen3ForCausalLM", "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM") diff --git a/vllm_ascend/models/qwen3_dbo.py b/vllm_ascend/models/qwen3_dbo.py new file mode 100644 index 00000000000..fa87fe81f22 --- /dev/null +++ b/vllm_ascend/models/qwen3_dbo.py @@ -0,0 +1,552 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+ +# """Inference-only Qwen3 model.""" +from types import SimpleNamespace +from typing import List, Optional, Union + +import torch +import torch_npu +import vllm.model_executor.models.qwen3_moe as qwen3 +from torch import nn +from transformers import PretrainedConfig +from vllm.attention import AttentionMetadata +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, + get_tp_group) +from vllm.forward_context import get_forward_context, set_forward_context +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.models.qwen3_moe import (Qwen3MoeDecoderLayer, + Qwen3MoeForCausalLM, + Qwen3MoeModel) +from vllm.model_executor.models.utils import ( + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.sequence import IntermediateTensors + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.distributed.tensor_parallel import \ + gather_from_sequence_parallel_region +from vllm_ascend.multistream.base import MSEventKey +from vllm_ascend.multistream.context import ( + advance_step_multistream_layer_context, get_multistream_layer_context) +from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer, + MultiStreamPreTransformerLayer) +from vllm_ascend.multistream.metadata import (MultiStreamConfig, + MultiStreamStepMetadata, + make_multistream_metadata_ds) +from vllm_ascend.ops.fused_moe import (AscendSparseMoeBlock, apply_mlp, + select_experts) + +VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO + + +class Qwen3MoeDecoderLayerDBO(Qwen3MoeDecoderLayer): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super(Qwen3MoeDecoderLayerDBO, self).__init__(config, cache_config, + quant_config, prefix) + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tp_group().rank_in_group + self.tp_group = get_tp_group().device_group + self.dummy_vllm_config = SimpleNamespace( + parallel_config=SimpleNamespace(data_parallel_size=1, ), + compilation_config=SimpleNamespace(static_forward_context=None, ), + other_setting="value", + ) + self.config = config + + def forward(self, *args, **kwargs): + return super().forward(*args, **kwargs) + + # should split ops in Decoder Layer + def _forward_ms_op_input_layernorm( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + return hidden_states, residual + + def _forward_ms_op_attn( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + self.dummy_vllm_config.compilation_config.static_forward_context = ( + get_forward_context().no_compile_layers) + with set_forward_context(attn_metadata, self.dummy_vllm_config): + 
hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + if hidden_states.dtype == torch.float16: + # Fix FP16 overflow + # We scale both hidden_states and residual before + # rmsnorm, and rmsnorm result would not affect by scale. + hidden_states *= 1.0 / self.routed_scaling_factor + if self.layer_idx == 0: + # The residual is shared by all layers, we only scale it on + # first layer. + residual *= 1.0 / self.routed_scaling_factor + return hidden_states, residual + + def _forward_ms_op_post_attn_layernorm( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ): + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + return hidden_states, residual + + def _forward_op_gating( + self, + hidden_states: torch.Tensor, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + if attn_metadata is None: + attn_metadata = get_forward_context().attn_metadata + # when profile runs, force experts to load balanced tokens + # to avoid high memory consumption on a single rank. + enable_force_load_balance = get_forward_context().in_profile_run + + num_tokens, hidden_dim = hidden_states.shape + + if self.tp_size > 1: + # pass + num_tokens, hidden_size = hidden_states.shape + if num_tokens < self.tp_size: + hidden_states = nn.functional.pad( + hidden_states, (0, 0, 0, self.tp_size - num_tokens)) + chunk_hidden_states = torch.tensor_split(hidden_states, + self.tp_size, + dim=0) + chunked_hidden_states_sizes = [ + x.shape[0] for x in chunk_hidden_states + ] + local_hidden_states = chunk_hidden_states[self.tp_rank] + else: + local_hidden_states = hidden_states + chunked_hidden_states_sizes = None + + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.mlp.gate(local_hidden_states) + + # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern + mlp_config = self.config + if mlp_config.num_experts == 256: + topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( + router_logits, + k=mlp_config.num_experts_per_tok, # topk当前写8 + bias=self.mlp.gate.e_score_correction_bias, + k_group=mlp_config.topk_group, # fix: 4 + group_count=mlp_config.n_group, # fix 8 + group_select_mode=1, # 0: max in group; 1: topk2.sum(fix) + renorm=0, # 0: softmax->topk(fix); 1: topk->softmax + norm_type=1, # 0: softmax; 1: sigmoid(fix) + routed_scaling_factor=1, + eps=float(1e-20), + ) + else: + topk_weights, topk_ids = select_experts( + hidden_states=local_hidden_states, + router_logits=router_logits, + top_k=mlp_config.num_experts_per_tok, + use_grouped_topk=False, + renormalize=mlp_config.norm_topk_prob, + topk_group=getattr(mlp_config, "topk_group", None), + num_expert_group=getattr(mlp_config, "n_group", None), + custom_routing_function=None, + scoring_func=getattr(mlp_config, "scoring_func", "softmax"), + e_score_correction_bias=getattr(self.mlp.gate, + "e_score_correction_bias", + None), + ) + + topk_weights = topk_weights.to(hidden_states.dtype) + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. 
+ if enable_force_load_balance: + topk_ids = torch.randint_like(topk_ids, 0, self.config.num_experts) + + return topk_weights, topk_ids, local_hidden_states, chunked_hidden_states_sizes + + def _forward_op_grouped_mlp(self, dispatched_input, tokens_per_expert): + return apply_mlp( + dispatched_input, + self.mlp.experts.w13_weight, + self.mlp.experts.w2_weight, + tokens_per_expert, + ) + + def _forward_combine_comm(self, hidden_states, microbatch_id, num_tokens, + chunked_hidden_states_sizes): + token_dispatcher = self.mlp.experts.token_dispatchers[microbatch_id] + final_hidden_states, _ = token_dispatcher.token_unpermutation( + hidden_states) + if hasattr(self.mlp, "routed_scaling_factor"): + final_hidden_states = final_hidden_states * self.mlp.routed_scaling_factor + + if self.tp_size > 1: + final_hidden_states = gather_from_sequence_parallel_region( + final_hidden_states, self.tp_group, + chunked_hidden_states_sizes) + if num_tokens < self.tp_size: + final_hidden_states = final_hidden_states[:num_tokens] + + if hasattr(self.mlp, "shared_experts"): + final_hidden_states = ( + final_hidden_states + + token_dispatcher.cached_shared_expert_output) + token_dispatcher.cached_shared_expert_output.untyped_storage( + ).resize_(0) + token_dispatcher.cached_shared_expert_output = None + + final_hidden_states = final_hidden_states.view(num_tokens, -1) + + return final_hidden_states + + def _forward_ms_layer_alltoallv_finegrained( + self, + positions: List[torch.Tensor], + hidden_states: List[torch.Tensor], + residual: List[torch.Tensor], + attn_metadata: List[AttentionMetadata], + kv_cache: Optional[torch.Tensor] = None, + ): + layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( + ) + assert layer_index >= 0 and ms_metadata is not None + num_micro_batchs = ms_metadata.ms_config.num_micro_batches + assert len(positions) == num_micro_batchs + assert len(hidden_states) == num_micro_batchs + assert residual is not None + assert attn_metadata is not None + num_tokens = [None] * num_micro_batchs + hidden_dims = [None] * num_micro_batchs + topk_weights, topk_ids = [None] * num_micro_batchs, [ + None + ] * num_micro_batchs + tokens_per_expert = [None] * num_micro_batchs + dispatched_input = [None] * num_micro_batchs + router_expert_output = [None] * num_micro_batchs + chunked_hidden_states_sizes = [None] * num_micro_batchs + token_dispatchers = self.mlp.experts.token_dispatchers + + def discard_tensor(tensor): + if isinstance(tensor, torch.Tensor): + tensor = [tensor] + for t in tensor: + t.untyped_storage().resize_(0) + + # block 1 : attention + # block 2 : Router Gating + # block 3 : Token DisPatch + # the attn computation of microbatch 1 can be overlapped with the moe + # communication in the previous layer, and the attn computation of microbatch 2 + # can be overlapped with the attn communication of microbatch 1 + for i in range(num_micro_batchs): + forward_context = get_forward_context() + layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( + ) + ms_metadata.try_wait_event(layer_index - 1, i, + MSEventKey.FFN_AR_FINISH) + forward_context.attn_metadata = attn_metadata[i] + + # input layernorm + hidden_states[i], residual[ + i] = self._forward_ms_op_input_layernorm( + hidden_states[i], residual[i]) + # attention and tp allreduce + hidden_states[i], residual[i] = self._forward_ms_op_attn( + positions[i], hidden_states[i], residual[i], kv_cache, + attn_metadata[i]) + # post attention layer norm + hidden_states[i], residual[ + i] = 
self._forward_ms_op_post_attn_layernorm( + hidden_states[i], residual[i]) + num_tokens[i], hidden_dims[i] = hidden_states[i].shape + # If TP is enabled, hidden_states will be chunked. + ( + topk_weights[i], + topk_ids[i], + dispatched_input[i], + chunked_hidden_states_sizes[i], + ) = self._forward_op_gating(hidden_states[i], attn_metadata[i]) + token_dispatchers[i].preprocess_and_permtute1( + dispatched_input[i], + topk_weights[i], + topk_ids[i], + shared_experts=None, + shared_experts_input=None, + ) + # Launch DisPatch Comm in a New Stream. + dispatch_context = MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_BEFORE_COMM], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_AFTER_COMM], + ) + dispatch_context.before_comm_event.record() + # print_with_sync(f'begin token dispatch{i}...', torch.distributed.get_rank()) + with torch.npu.stream(dispatch_context.comm_stream): + dispatch_context.comm_stream.wait_event( + dispatch_context.before_comm_event) + token_dispatchers[i].dispatch_alltoall() + dispatched_input[i], tokens_per_expert[i] = token_dispatchers[ + i].permute2() + dispatch_context.after_comm_event.record() + + # print_with_sync('begin experts...', torch.distributed.get_rank()) + # block 4 : Router Experts Computation + # block 5 : Token Combine Communication + for i in range(num_micro_batchs): + ms_metadata.try_wait_event(layer_index, i, + MSEventKey.MOE_AFTER_COMM) + discard_tensor(hidden_states[i]) + router_expert_output[i] = self._forward_op_grouped_mlp( + dispatched_input[i], tokens_per_expert[i]) + discard_tensor(dispatched_input[i]) + + # Launch Combine Comm in a New Stream. + combine_context = MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.FFN_COM_FINISH], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.FFN_AR_FINISH], + ) + combine_context.before_comm_event.record() + ms_metadata.try_wait_event(layer_index, i, + MSEventKey.MOE_SE_COMM_FINISH) + with torch.npu.stream(combine_context.comm_stream): + combine_context.comm_stream.wait_event( + combine_context.before_comm_event) + hidden_states[i] = self._forward_combine_comm( + router_expert_output[i], + i, + num_tokens[i], + chunked_hidden_states_sizes[i], + ) + ms_metadata.ms_events[layer_index][i][ + MSEventKey. 
+ FFN_AR_FINISH] = combine_context.comm_stream.record_event( + ) + + return hidden_states, residual + + +@support_torch_compile +class CustomQwen3DBOMoEModel(Qwen3MoeModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + prefix=f"{prefix}.embed_tokens") + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Qwen3MoeDecoderLayerDBO( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size) + + # dbo related members + if VLLM_ASCEND_ENABLE_DBO: + self.use_mla = False + self.multistream_config = MultiStreamConfig() + multistream_metadata = make_multistream_metadata_ds( + start_layer=self.start_layer, + end_layer=self.end_layer, + causal_lm=getattr(config, "causal_lm", True), + multistream_config=self.multistream_config, + ) + self.ms_pre_layer = MultiStreamPreTransformerLayer( + multistream_metadata) + self.ms_post_layer = MultiStreamPostTransformerLayer( + multistream_metadata) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + num_normal_layers = (0 if VLLM_ASCEND_ENABLE_DBO and self.can_run_ms() + else self.end_layer - self.start_layer) + + moe_start_layer = self.start_layer + num_normal_layers + for i in range(self.start_layer, min(moe_start_layer, self.end_layer)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual) + + if moe_start_layer < self.end_layer: + # if we enable multistream/dbo, process sparse layers here + hidden_states, residual = self._forward_ms_layers( + positions=positions, + hidden_states=hidden_states, + residual=residual, + moe_start_layer=moe_start_layer, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def can_run_ms(self): + attn_metadata = get_forward_context().attn_metadata + # enable prefill overlap + with_prefill = get_forward_context().with_prefill + if (attn_metadata is None or not with_prefill + or not attn_metadata.enable_dbo_across_dp): + return False + + return True + + def _forward_ms_layers( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor, + moe_start_layer: int, + kv_caches: Optional[List[torch.Tensor]] = None, + ): + + if moe_start_layer == self.end_layer: + return hidden_states, residual + + 
attn_metadata, [positions, hidden_states, + residual] = self.ms_pre_layer( + [positions, hidden_states, residual], ) + num_micro_batch = len(attn_metadata) + # the rest layers + for i in range(moe_start_layer, self.end_layer): + layer = self.layers[i] + ms_layer_forward_func = layer._forward_ms_layer_alltoallv_finegrained + # print("get_called......") + hidden_states, residual = ms_layer_forward_func( + positions=positions, + hidden_states=hidden_states, + residual=residual, + attn_metadata=attn_metadata, + ) + advance_step_multistream_layer_context() + + layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( + ) + for i in range(num_micro_batch): + ms_metadata.try_wait_event(layer_index - 1, i, + MSEventKey.FFN_AR_FINISH) + + [hidden_states, + residual] = self.ms_post_layer([hidden_states, residual], ) + return hidden_states, residual + + +class CustomQwen3MoeForCausalLMDBO(Qwen3MoeForCausalLM): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], + } + qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = CustomQwen3DBOMoEModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def forward(self, *args, **kwargs): + if "graph_enable" in kwargs: + kwargs.pop("graph_enable") + return super().forward(*args, **kwargs) diff --git a/vllm_ascend/multistream/ms_split.py b/vllm_ascend/multistream/ms_split.py index 3af6337e473..684f6aea136 100644 --- a/vllm_ascend/multistream/ms_split.py +++ b/vllm_ascend/multistream/ms_split.py @@ -4,7 +4,8 @@ import numpy as np import torch -from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.attention_v1 import (AscendAttentionState, + AscendMetadata) from .base import MSAttentionMetadataSplitConfig @@ -241,3 +242,115 @@ def model_input_split_v1_mla_attn( decode=decode_post, ) return [attention_metadata_pre, attention_metadata_post] + + +def model_input_split_v1_attn( + attn_metadata: AscendMetadata, + _metadata_cls, + ms_split_config: MSAttentionMetadataSplitConfig, +) -> List[Any]: + assert 0 < ms_split_config.num_micro_batches < 3 + if attn_metadata is None: + return [attn_metadata] + [token_index, + seq_index] = compute_split_seq_index(attn_metadata.query_lens, + attn_metadata.attn_state, + attn_metadata.num_actual_tokens) + if token_index == 0 or seq_index == 0 or seq_index == len( + attn_metadata.query_lens): + return [attn_metadata] + + # split attn metadata + + [block_table_pre, + block_table_post] = split_attn_tensor_type(attn_metadata.block_tables, + seq_index) + + query_start_loc_pre = query_start_loc_post = None + if attn_metadata.query_start_loc is not None: + query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] + query_start_loc_post = deepcopy( + attn_metadata.query_start_loc[seq_index:] + ) - 
attn_metadata.query_start_loc[seq_index] + + [query_lens_pre, + query_lens_post] = split_attn_tensor_type(attn_metadata.query_lens, + seq_index) + [seq_lens_pre, + seq_lens_post] = split_attn_tensor_type(attn_metadata.seq_lens, seq_index) + + max_query_len_pre = max_query_len_post = None + if attn_metadata.max_query_len is not None: + max_query_len_pre, max_query_len_post = max(query_lens_pre), max( + query_lens_post) + + [slot_mapping_pre, + slot_mapping_post] = split_attn_tensor_type(attn_metadata.slot_mapping, + token_index) + + is_only_prefill_pre = is_only_prefill_post = attn_metadata.is_only_prefill + has_prefill_pre, _ = torch.any(query_lens_pre > 1).item(), torch.any( + query_lens_post > 1).item() + + if not attn_metadata.is_only_prefill: + is_only_prefill_post = torch.all(query_lens_post > 1).item() + + if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: + # the attn_mla kernel in torch npu only accept 128*128 attn mask + attn_mask_pre = attn_mask_post = attn_metadata.attn_mask + attn_state_pre = attn_state_post = attn_metadata.attn_state + elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: + # should be none in decode only state + attn_mask_pre = attn_mask_post = attn_metadata.attn_mask + attn_state_pre = attn_state_post = AscendAttentionState.DecodeOnly # type: ignore + else: + # chunked prefill + assert attn_metadata.attn_mask is not None + if has_prefill_pre: + attn_state_pre = attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore + attn_mask_pre = attn_metadata.attn_mask[:token_index, :max( + seq_lens_pre)].contiguous() + attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore + attn_mask_post = attn_metadata.attn_mask[ + token_index:, :max(seq_lens_post)].contiguous() + else: + attn_state_pre = AscendAttentionState.DecodeOnly # type: ignore + attn_mask_pre = None + attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore + attn_mask_post = attn_metadata.attn_mask[ + token_index:, :max(seq_lens_post)].contiguous() + + # construct metadata + attention_metadata_pre = _metadata_cls( + num_actual_tokens=token_index, + block_tables=block_table_pre, + query_start_loc=query_start_loc_pre, + query_lens=query_lens_pre, + seq_lens=seq_lens_pre, + seq_lens_list=seq_lens_pre.tolist(), + max_query_len=max_query_len_pre, + slot_mapping=slot_mapping_pre, + is_only_prefill=is_only_prefill_pre, + attn_state=attn_state_pre, + attn_mask=attn_mask_pre, + num_input_tokens=token_index, + enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, + ) + + attention_metadata_post = _metadata_cls( + num_actual_tokens=attn_metadata.num_actual_tokens - token_index, + block_tables=block_table_post, + query_start_loc=query_start_loc_post, + query_lens=query_lens_post, + seq_lens=seq_lens_post, + seq_lens_list=seq_lens_post.tolist(), + max_query_len=max_query_len_post, + slot_mapping=slot_mapping_post, + is_only_prefill=is_only_prefill_post, + attn_state=attn_state_post, + attn_mask=attn_mask_post, + num_input_tokens=attn_metadata.num_input_tokens - token_index, + enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, + ) + + return [attention_metadata_pre, attention_metadata_post] From 71bc50bc1f619a99fb7a0c730715d50785e75fa3 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 09:46:27 +0800 Subject: [PATCH 21/56] [v0.9.1][Feature] add moe alltoallv. 
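Context for the metadata split added above: model_input_split_v1_attn re-bases
query_start_loc so that the second micro-batch counts its tokens from zero again.
A minimal sketch with toy values (illustrative only, not taken from a real batch;
only plain torch is assumed):

    import torch

    # toy batch: three sequences with query lengths [4, 3, 5]
    query_start_loc = torch.tensor([0, 4, 7, 12])
    seq_index, token_index = 2, 7          # split after the second sequence

    query_start_loc_pre = query_start_loc[:seq_index + 1]            # [0, 4, 7]
    # the post half keeps its sequences but restarts the token offsets at zero
    query_start_loc_post = (query_start_loc[seq_index:].clone()
                            - query_start_loc[seq_index])            # [0, 5]

    assert query_start_loc_pre[-1].item() == token_index
    assert query_start_loc_post[-1].item() == 12 - token_index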
Signed-off-by: weijinqian_v1 --- tests/e2e/multicard/test_offline_inference_distributed.py | 6 ++++-- vllm_ascend/ops/fused_moe.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index bf1269a8f62..884db3e36ba 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -149,8 +149,10 @@ def test_models_distributed_topk() -> None: top_p=0.9) with VllmRunner( - "deepseek-ai/DeepSeek-V2-Lite", - dtype=dtype, + snapshot_download("vllm-ascend/pangu-pro-moe-pruing"), + max_model_len=8192, + enforce_eager=True, + dtype="auto", tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index cc9e47afc4c..c39446f06f5 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1486,7 +1486,7 @@ def forward(self, mc2_mask=mc2_mask, quantized_x_for_share=quantized_x_for_share, dynamic_scale_for_share=dynamic_scale_for_share, - token_dispatcher=self.token_dispatcher, + token_dispatcher=self.token_dispatcher ) if shared_experts: From b118bbd9ab43e41a4aa027e972cd05a734fa24dd Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 09:49:33 +0800 Subject: [PATCH 22/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- .../test_offline_inference_distributed.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index 884db3e36ba..3ecab6aa009 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -119,24 +119,6 @@ def test_models_distributed_DeepSeekV3_dbo(): def test_models_distributed_pangu(): - example_prompts = [ - "Hello, my name is", - ] - max_tokens = 5 - - with VllmRunner( - snapshot_download("vllm-ascend/pangu-pro-moe-pruing"), - max_model_len=8192, - enforce_eager=True, - dtype="auto", - tensor_parallel_size=2, - distributed_executor_backend="mp", - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - -@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"}) -def test_models_distributed_topk() -> None: example_prompts = [ "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", From 978f43013a3ba6733388eab412f6a48042daa696 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 09:50:44 +0800 Subject: [PATCH 23/56] [v0.9.1][Feature] add moe alltoallv. 
Signed-off-by: weijinqian_v1 --- vllm_ascend/ops/fused_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index c39446f06f5..c819eeb3391 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1484,9 +1484,9 @@ def forward(self, shared_experts=shared_experts if self.torchair_graph_enabled and self.enable_multistream_moe and not is_prefill else None, mc2_mask=mc2_mask, + token_dispatcher=self.token_dispatcher, quantized_x_for_share=quantized_x_for_share, dynamic_scale_for_share=dynamic_scale_for_share, - token_dispatcher=self.token_dispatcher ) if shared_experts: From 2922d9ea4355e45aea07c4ec735f17c112c28bbf Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 10:35:37 +0800 Subject: [PATCH 24/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/multistream/ms_split.py | 112 ---------------------------- 1 file changed, 112 deletions(-) diff --git a/vllm_ascend/multistream/ms_split.py b/vllm_ascend/multistream/ms_split.py index 684f6aea136..7d58506f662 100644 --- a/vllm_ascend/multistream/ms_split.py +++ b/vllm_ascend/multistream/ms_split.py @@ -242,115 +242,3 @@ def model_input_split_v1_mla_attn( decode=decode_post, ) return [attention_metadata_pre, attention_metadata_post] - - -def model_input_split_v1_attn( - attn_metadata: AscendMetadata, - _metadata_cls, - ms_split_config: MSAttentionMetadataSplitConfig, -) -> List[Any]: - assert 0 < ms_split_config.num_micro_batches < 3 - if attn_metadata is None: - return [attn_metadata] - [token_index, - seq_index] = compute_split_seq_index(attn_metadata.query_lens, - attn_metadata.attn_state, - attn_metadata.num_actual_tokens) - if token_index == 0 or seq_index == 0 or seq_index == len( - attn_metadata.query_lens): - return [attn_metadata] - - # split attn metadata - - [block_table_pre, - block_table_post] = split_attn_tensor_type(attn_metadata.block_tables, - seq_index) - - query_start_loc_pre = query_start_loc_post = None - if attn_metadata.query_start_loc is not None: - query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] - query_start_loc_post = deepcopy( - attn_metadata.query_start_loc[seq_index:] - ) - attn_metadata.query_start_loc[seq_index] - - [query_lens_pre, - query_lens_post] = split_attn_tensor_type(attn_metadata.query_lens, - seq_index) - [seq_lens_pre, - seq_lens_post] = split_attn_tensor_type(attn_metadata.seq_lens, seq_index) - - max_query_len_pre = max_query_len_post = None - if attn_metadata.max_query_len is not None: - max_query_len_pre, max_query_len_post = max(query_lens_pre), max( - query_lens_post) - - [slot_mapping_pre, - slot_mapping_post] = split_attn_tensor_type(attn_metadata.slot_mapping, - token_index) - - is_only_prefill_pre = is_only_prefill_post = attn_metadata.is_only_prefill - has_prefill_pre, _ = torch.any(query_lens_pre > 1).item(), torch.any( - query_lens_post > 1).item() - - if not attn_metadata.is_only_prefill: - is_only_prefill_post = torch.all(query_lens_post > 1).item() - - if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: - # the attn_mla kernel in torch npu only accept 128*128 attn mask - attn_mask_pre = attn_mask_post = attn_metadata.attn_mask - attn_state_pre = attn_state_post = attn_metadata.attn_state - elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: - # should be none in decode only state - attn_mask_pre = attn_mask_post = 
attn_metadata.attn_mask - attn_state_pre = attn_state_post = AscendAttentionState.DecodeOnly # type: ignore - else: - # chunked prefill - assert attn_metadata.attn_mask is not None - if has_prefill_pre: - attn_state_pre = attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore - attn_mask_pre = attn_metadata.attn_mask[:token_index, :max( - seq_lens_pre)].contiguous() - attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore - attn_mask_post = attn_metadata.attn_mask[ - token_index:, :max(seq_lens_post)].contiguous() - else: - attn_state_pre = AscendAttentionState.DecodeOnly # type: ignore - attn_mask_pre = None - attn_state_post = AscendAttentionState.ChunkedPrefill # type: ignore - attn_mask_post = attn_metadata.attn_mask[ - token_index:, :max(seq_lens_post)].contiguous() - - # construct metadata - attention_metadata_pre = _metadata_cls( - num_actual_tokens=token_index, - block_tables=block_table_pre, - query_start_loc=query_start_loc_pre, - query_lens=query_lens_pre, - seq_lens=seq_lens_pre, - seq_lens_list=seq_lens_pre.tolist(), - max_query_len=max_query_len_pre, - slot_mapping=slot_mapping_pre, - is_only_prefill=is_only_prefill_pre, - attn_state=attn_state_pre, - attn_mask=attn_mask_pre, - num_input_tokens=token_index, - enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, - ) - - attention_metadata_post = _metadata_cls( - num_actual_tokens=attn_metadata.num_actual_tokens - token_index, - block_tables=block_table_post, - query_start_loc=query_start_loc_post, - query_lens=query_lens_post, - seq_lens=seq_lens_post, - seq_lens_list=seq_lens_post.tolist(), - max_query_len=max_query_len_post, - slot_mapping=slot_mapping_post, - is_only_prefill=is_only_prefill_post, - attn_state=attn_state_post, - attn_mask=attn_mask_post, - num_input_tokens=attn_metadata.num_input_tokens - token_index, - enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, - ) - - return [attention_metadata_pre, attention_metadata_post] From 85a70fd88d710410e9af3369ce9b4d950503d0ab Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 10:39:19 +0800 Subject: [PATCH 25/56] [v0.9.1][Feature] add moe alltoallv. 
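The chunked-prefill branch shown above decides the attention state per half: if the
first half still contains a prefill query, both halves stay ChunkedPrefill; otherwise
the first half degrades to DecodeOnly while the second half keeps ChunkedPrefill
together with its sliced attention mask. A standalone illustration with toy query
lengths (the strings stand in for the AscendAttentionState enum values):

    import torch

    query_lens_pre = torch.tensor([1, 1, 1])    # first half: decode-style queries
    query_lens_post = torch.tensor([1, 6, 2])   # second half: still has prefill chunks

    has_prefill_pre = torch.any(query_lens_pre > 1).item()

    # mirrors the branch above
    attn_state_pre = "ChunkedPrefill" if has_prefill_pre else "DecodeOnly"
    attn_state_post = "ChunkedPrefill"
    print(attn_state_pre, attn_state_post)      # DecodeOnly ChunkedPrefill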
Signed-off-by: weijinqian_v1 --- vllm_ascend/attention/attention_v1.py | 12 - vllm_ascend/models/__init__.py | 11 +- vllm_ascend/models/qwen3_dbo.py | 552 -------------------------- vllm_ascend/multistream/ms_split.py | 3 +- 4 files changed, 4 insertions(+), 574 deletions(-) delete mode 100644 vllm_ascend/models/qwen3_dbo.py diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 01b51e15607..c2ecebb100a 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -151,18 +151,6 @@ class AscendMetadata: # (num_tokens,) slot_mapping: torch.Tensor = None - def split_metadata_for_multistream( - self, - ms_split_config: MSAttentionMetadataSplitConfig, - ) -> list["AscendMetadata"]: - """Split metadata for multi-stream with AscendMetadata""" - from vllm_ascend.multistream.ms_split import model_input_split_v1_attn - return model_input_split_v1_attn( - ms_split_config=ms_split_config, - attn_metadata=self, - _metadata_cls=AscendMetadata, - ) - class AscendAttentionMetadataBuilder: diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index e3609f802d7..f47e821b345 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -40,11 +40,6 @@ def register_model(): ModelRegistry.register_model( "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM") - - ModelRegistry.register_model( - "Qwen3MoeForCausalLM", - "vllm_ascend.models.qwen3_dbo:CustomQwen3MoeForCausalLMDBO") - else: ModelRegistry.register_model( "DeepseekV2ForCausalLM", @@ -54,9 +49,9 @@ def register_model(): "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM") - ModelRegistry.register_model( - "Qwen3MoeForCausalLM", - "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") + ModelRegistry.register_model( + "Qwen3MoeForCausalLM", + "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM") ModelRegistry.register_model( "Qwen3ForCausalLM", "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM") diff --git a/vllm_ascend/models/qwen3_dbo.py b/vllm_ascend/models/qwen3_dbo.py deleted file mode 100644 index fa87fe81f22..00000000000 --- a/vllm_ascend/models/qwen3_dbo.py +++ /dev/null @@ -1,552 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
- -# """Inference-only Qwen3 model.""" -from types import SimpleNamespace -from typing import List, Optional, Union - -import torch -import torch_npu -import vllm.model_executor.models.qwen3_moe as qwen3 -from torch import nn -from transformers import PretrainedConfig -from vllm.attention import AttentionMetadata -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - get_tp_group) -from vllm.forward_context import get_forward_context, set_forward_context -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.models.qwen3_moe import (Qwen3MoeDecoderLayer, - Qwen3MoeForCausalLM, - Qwen3MoeModel) -from vllm.model_executor.models.utils import ( - make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.sequence import IntermediateTensors - -import vllm_ascend.envs as envs_ascend -from vllm_ascend.distributed.tensor_parallel import \ - gather_from_sequence_parallel_region -from vllm_ascend.multistream.base import MSEventKey -from vllm_ascend.multistream.context import ( - advance_step_multistream_layer_context, get_multistream_layer_context) -from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer, - MultiStreamPreTransformerLayer) -from vllm_ascend.multistream.metadata import (MultiStreamConfig, - MultiStreamStepMetadata, - make_multistream_metadata_ds) -from vllm_ascend.ops.fused_moe import (AscendSparseMoeBlock, apply_mlp, - select_experts) - -VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO - - -class Qwen3MoeDecoderLayerDBO(Qwen3MoeDecoderLayer): - - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super(Qwen3MoeDecoderLayerDBO, self).__init__(config, cache_config, - quant_config, prefix) - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tp_group().rank_in_group - self.tp_group = get_tp_group().device_group - self.dummy_vllm_config = SimpleNamespace( - parallel_config=SimpleNamespace(data_parallel_size=1, ), - compilation_config=SimpleNamespace(static_forward_context=None, ), - other_setting="value", - ) - self.config = config - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - - # should split ops in Decoder Layer - def _forward_ms_op_input_layernorm( - self, - hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - return hidden_states, residual - - def _forward_ms_op_attn( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor, - kv_cache: Optional[torch.Tensor] = None, - attn_metadata: Optional[AttentionMetadata] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - self.dummy_vllm_config.compilation_config.static_forward_context = ( - get_forward_context().no_compile_layers) - with set_forward_context(attn_metadata, self.dummy_vllm_config): - 
hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - if hidden_states.dtype == torch.float16: - # Fix FP16 overflow - # We scale both hidden_states and residual before - # rmsnorm, and rmsnorm result would not affect by scale. - hidden_states *= 1.0 / self.routed_scaling_factor - if self.layer_idx == 0: - # The residual is shared by all layers, we only scale it on - # first layer. - residual *= 1.0 / self.routed_scaling_factor - return hidden_states, residual - - def _forward_ms_op_post_attn_layernorm( - self, - hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], - ): - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - return hidden_states, residual - - def _forward_op_gating( - self, - hidden_states: torch.Tensor, - attn_metadata: Optional[AttentionMetadata] = None, - ) -> torch.Tensor: - if attn_metadata is None: - attn_metadata = get_forward_context().attn_metadata - # when profile runs, force experts to load balanced tokens - # to avoid high memory consumption on a single rank. - enable_force_load_balance = get_forward_context().in_profile_run - - num_tokens, hidden_dim = hidden_states.shape - - if self.tp_size > 1: - # pass - num_tokens, hidden_size = hidden_states.shape - if num_tokens < self.tp_size: - hidden_states = nn.functional.pad( - hidden_states, (0, 0, 0, self.tp_size - num_tokens)) - chunk_hidden_states = torch.tensor_split(hidden_states, - self.tp_size, - dim=0) - chunked_hidden_states_sizes = [ - x.shape[0] for x in chunk_hidden_states - ] - local_hidden_states = chunk_hidden_states[self.tp_rank] - else: - local_hidden_states = hidden_states - chunked_hidden_states_sizes = None - - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.mlp.gate(local_hidden_states) - - # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern - mlp_config = self.config - if mlp_config.num_experts == 256: - topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( - router_logits, - k=mlp_config.num_experts_per_tok, # topk当前写8 - bias=self.mlp.gate.e_score_correction_bias, - k_group=mlp_config.topk_group, # fix: 4 - group_count=mlp_config.n_group, # fix 8 - group_select_mode=1, # 0: max in group; 1: topk2.sum(fix) - renorm=0, # 0: softmax->topk(fix); 1: topk->softmax - norm_type=1, # 0: softmax; 1: sigmoid(fix) - routed_scaling_factor=1, - eps=float(1e-20), - ) - else: - topk_weights, topk_ids = select_experts( - hidden_states=local_hidden_states, - router_logits=router_logits, - top_k=mlp_config.num_experts_per_tok, - use_grouped_topk=False, - renormalize=mlp_config.norm_topk_prob, - topk_group=getattr(mlp_config, "topk_group", None), - num_expert_group=getattr(mlp_config, "n_group", None), - custom_routing_function=None, - scoring_func=getattr(mlp_config, "scoring_func", "softmax"), - e_score_correction_bias=getattr(self.mlp.gate, - "e_score_correction_bias", - None), - ) - - topk_weights = topk_weights.to(hidden_states.dtype) - # this is a naive implementation for experts load balance so as - # to avoid accumulating too much tokens on a single rank. - # currently it is only activated when doing profile runs. 
- if enable_force_load_balance: - topk_ids = torch.randint_like(topk_ids, 0, self.config.num_experts) - - return topk_weights, topk_ids, local_hidden_states, chunked_hidden_states_sizes - - def _forward_op_grouped_mlp(self, dispatched_input, tokens_per_expert): - return apply_mlp( - dispatched_input, - self.mlp.experts.w13_weight, - self.mlp.experts.w2_weight, - tokens_per_expert, - ) - - def _forward_combine_comm(self, hidden_states, microbatch_id, num_tokens, - chunked_hidden_states_sizes): - token_dispatcher = self.mlp.experts.token_dispatchers[microbatch_id] - final_hidden_states, _ = token_dispatcher.token_unpermutation( - hidden_states) - if hasattr(self.mlp, "routed_scaling_factor"): - final_hidden_states = final_hidden_states * self.mlp.routed_scaling_factor - - if self.tp_size > 1: - final_hidden_states = gather_from_sequence_parallel_region( - final_hidden_states, self.tp_group, - chunked_hidden_states_sizes) - if num_tokens < self.tp_size: - final_hidden_states = final_hidden_states[:num_tokens] - - if hasattr(self.mlp, "shared_experts"): - final_hidden_states = ( - final_hidden_states + - token_dispatcher.cached_shared_expert_output) - token_dispatcher.cached_shared_expert_output.untyped_storage( - ).resize_(0) - token_dispatcher.cached_shared_expert_output = None - - final_hidden_states = final_hidden_states.view(num_tokens, -1) - - return final_hidden_states - - def _forward_ms_layer_alltoallv_finegrained( - self, - positions: List[torch.Tensor], - hidden_states: List[torch.Tensor], - residual: List[torch.Tensor], - attn_metadata: List[AttentionMetadata], - kv_cache: Optional[torch.Tensor] = None, - ): - layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( - ) - assert layer_index >= 0 and ms_metadata is not None - num_micro_batchs = ms_metadata.ms_config.num_micro_batches - assert len(positions) == num_micro_batchs - assert len(hidden_states) == num_micro_batchs - assert residual is not None - assert attn_metadata is not None - num_tokens = [None] * num_micro_batchs - hidden_dims = [None] * num_micro_batchs - topk_weights, topk_ids = [None] * num_micro_batchs, [ - None - ] * num_micro_batchs - tokens_per_expert = [None] * num_micro_batchs - dispatched_input = [None] * num_micro_batchs - router_expert_output = [None] * num_micro_batchs - chunked_hidden_states_sizes = [None] * num_micro_batchs - token_dispatchers = self.mlp.experts.token_dispatchers - - def discard_tensor(tensor): - if isinstance(tensor, torch.Tensor): - tensor = [tensor] - for t in tensor: - t.untyped_storage().resize_(0) - - # block 1 : attention - # block 2 : Router Gating - # block 3 : Token DisPatch - # the attn computation of microbatch 1 can be overlapped with the moe - # communication in the previous layer, and the attn computation of microbatch 2 - # can be overlapped with the attn communication of microbatch 1 - for i in range(num_micro_batchs): - forward_context = get_forward_context() - layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( - ) - ms_metadata.try_wait_event(layer_index - 1, i, - MSEventKey.FFN_AR_FINISH) - forward_context.attn_metadata = attn_metadata[i] - - # input layernorm - hidden_states[i], residual[ - i] = self._forward_ms_op_input_layernorm( - hidden_states[i], residual[i]) - # attention and tp allreduce - hidden_states[i], residual[i] = self._forward_ms_op_attn( - positions[i], hidden_states[i], residual[i], kv_cache, - attn_metadata[i]) - # post attention layer norm - hidden_states[i], residual[ - i] = 
self._forward_ms_op_post_attn_layernorm( - hidden_states[i], residual[i]) - num_tokens[i], hidden_dims[i] = hidden_states[i].shape - # If TP is enabled, hidden_states will be chunked. - ( - topk_weights[i], - topk_ids[i], - dispatched_input[i], - chunked_hidden_states_sizes[i], - ) = self._forward_op_gating(hidden_states[i], attn_metadata[i]) - token_dispatchers[i].preprocess_and_permtute1( - dispatched_input[i], - topk_weights[i], - topk_ids[i], - shared_experts=None, - shared_experts_input=None, - ) - # Launch DisPatch Comm in a New Stream. - dispatch_context = MultiStreamStepMetadata( - comm_stream=ms_metadata.communicate_stream, - before_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.MOE_BEFORE_COMM], - after_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.MOE_AFTER_COMM], - ) - dispatch_context.before_comm_event.record() - # print_with_sync(f'begin token dispatch{i}...', torch.distributed.get_rank()) - with torch.npu.stream(dispatch_context.comm_stream): - dispatch_context.comm_stream.wait_event( - dispatch_context.before_comm_event) - token_dispatchers[i].dispatch_alltoall() - dispatched_input[i], tokens_per_expert[i] = token_dispatchers[ - i].permute2() - dispatch_context.after_comm_event.record() - - # print_with_sync('begin experts...', torch.distributed.get_rank()) - # block 4 : Router Experts Computation - # block 5 : Token Combine Communication - for i in range(num_micro_batchs): - ms_metadata.try_wait_event(layer_index, i, - MSEventKey.MOE_AFTER_COMM) - discard_tensor(hidden_states[i]) - router_expert_output[i] = self._forward_op_grouped_mlp( - dispatched_input[i], tokens_per_expert[i]) - discard_tensor(dispatched_input[i]) - - # Launch Combine Comm in a New Stream. - combine_context = MultiStreamStepMetadata( - comm_stream=ms_metadata.communicate_stream, - before_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.FFN_COM_FINISH], - after_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.FFN_AR_FINISH], - ) - combine_context.before_comm_event.record() - ms_metadata.try_wait_event(layer_index, i, - MSEventKey.MOE_SE_COMM_FINISH) - with torch.npu.stream(combine_context.comm_stream): - combine_context.comm_stream.wait_event( - combine_context.before_comm_event) - hidden_states[i] = self._forward_combine_comm( - router_expert_output[i], - i, - num_tokens[i], - chunked_hidden_states_sizes[i], - ) - ms_metadata.ms_events[layer_index][i][ - MSEventKey. 
- FFN_AR_FINISH] = combine_context.comm_stream.record_event( - ) - - return hidden_states, residual - - -@support_torch_compile -class CustomQwen3DBOMoEModel(Qwen3MoeModel): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - nn.Module.__init__(self) - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.config = config - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - prefix=f"{prefix}.embed_tokens") - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: Qwen3MoeDecoderLayerDBO( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix, - ), - prefix=f"{prefix}.layers", - ) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size) - - # dbo related members - if VLLM_ASCEND_ENABLE_DBO: - self.use_mla = False - self.multistream_config = MultiStreamConfig() - multistream_metadata = make_multistream_metadata_ds( - start_layer=self.start_layer, - end_layer=self.end_layer, - causal_lm=getattr(config, "causal_lm", True), - multistream_config=self.multistream_config, - ) - self.ms_pre_layer = MultiStreamPreTransformerLayer( - multistream_metadata) - self.ms_post_layer = MultiStreamPostTransformerLayer( - multistream_metadata) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - num_normal_layers = (0 if VLLM_ASCEND_ENABLE_DBO and self.can_run_ms() - else self.end_layer - self.start_layer) - - moe_start_layer = self.start_layer + num_normal_layers - for i in range(self.start_layer, min(moe_start_layer, self.end_layer)): - layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, residual) - - if moe_start_layer < self.end_layer: - # if we enable multistream/dbo, process sparse layers here - hidden_states, residual = self._forward_ms_layers( - positions=positions, - hidden_states=hidden_states, - residual=residual, - moe_start_layer=moe_start_layer, - ) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def can_run_ms(self): - attn_metadata = get_forward_context().attn_metadata - # enable prefill overlap - with_prefill = get_forward_context().with_prefill - if (attn_metadata is None or not with_prefill - or not attn_metadata.enable_dbo_across_dp): - return False - - return True - - def _forward_ms_layers( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor, - moe_start_layer: int, - kv_caches: Optional[List[torch.Tensor]] = None, - ): - - if moe_start_layer == self.end_layer: - return hidden_states, residual - - 
attn_metadata, [positions, hidden_states, - residual] = self.ms_pre_layer( - [positions, hidden_states, residual], ) - num_micro_batch = len(attn_metadata) - # the rest layers - for i in range(moe_start_layer, self.end_layer): - layer = self.layers[i] - ms_layer_forward_func = layer._forward_ms_layer_alltoallv_finegrained - # print("get_called......") - hidden_states, residual = ms_layer_forward_func( - positions=positions, - hidden_states=hidden_states, - residual=residual, - attn_metadata=attn_metadata, - ) - advance_step_multistream_layer_context() - - layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( - ) - for i in range(num_micro_batch): - ms_metadata.try_wait_event(layer_index - 1, i, - MSEventKey.FFN_AR_FINISH) - - [hidden_states, - residual] = self.ms_post_layer([hidden_states, residual], ) - return hidden_states, residual - - -class CustomQwen3MoeForCausalLMDBO(Qwen3MoeForCausalLM): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - "experts": - ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], - } - qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - nn.Module.__init__(self) - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = CustomQwen3DBOMoEModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def forward(self, *args, **kwargs): - if "graph_enable" in kwargs: - kwargs.pop("graph_enable") - return super().forward(*args, **kwargs) diff --git a/vllm_ascend/multistream/ms_split.py b/vllm_ascend/multistream/ms_split.py index 7d58506f662..3af6337e473 100644 --- a/vllm_ascend/multistream/ms_split.py +++ b/vllm_ascend/multistream/ms_split.py @@ -4,8 +4,7 @@ import numpy as np import torch -from vllm_ascend.attention.attention_v1 import (AscendAttentionState, - AscendMetadata) +from vllm_ascend.attention.attention_v1 import AscendAttentionState from .base import MSAttentionMetadataSplitConfig From 8363a8fb162e51930a7c45009e5e5d2af2adce21 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 10:39:50 +0800 Subject: [PATCH 26/56] [v0.9.1][Feature] add moe alltoallv. 
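One detail of the DBO layer removed above: discard_tensor (and the handling of the
cached shared-expert output) frees intermediate activations as soon as they are
consumed by resizing the tensor's untyped storage to zero bytes. A minimal
standalone illustration of that trick, using only plain torch:

    import torch

    t = torch.randn(1024, 1024)
    assert t.untyped_storage().nbytes() > 0
    t.untyped_storage().resize_(0)      # releases the backing memory immediately
    assert t.untyped_storage().nbytes() == 0
    # the tensor object itself survives, but its data is gone until reallocated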
Signed-off-by: weijinqian_v1 --- vllm_ascend/attention/attention_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index c2ecebb100a..668c802c400 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -28,7 +28,6 @@ from vllm.utils import direct_register_custom_op from vllm.v1.core.sched.output import SchedulerOutput -from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, nd_to_nz_2d, nd_to_nz_spec) From 1b4eaf648a90e6ea27d38091052a1691dbd3f32f Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 10:44:16 +0800 Subject: [PATCH 27/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- .../test_offline_inference_distributed.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index fda118d2bca..c2d76eda956 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -119,6 +119,24 @@ def test_models_distributed_DeepSeekV3_dbo(): def test_models_distributed_pangu(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + + with VllmRunner( + snapshot_download("vllm-ascend/pangu-pro-moe-pruing"), + max_model_len=8192, + enforce_eager=True, + dtype="auto", + tensor_parallel_size=2, + distributed_executor_backend="mp", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +@ patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"}) +def test_models_distributed_topk() -> None: example_prompts = [ "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", From a819a339c6f586ad9ca9ecf6dfb014207e7b4136 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 10:45:33 +0800 Subject: [PATCH 28/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/e2e/multicard/test_offline_inference_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index c2d76eda956..fe5dd8c1c29 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -135,7 +135,7 @@ def test_models_distributed_pangu(): vllm_model.generate_greedy(example_prompts, max_tokens) -@ patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"}) +@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"}) def test_models_distributed_topk() -> None: example_prompts = [ "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", From 6fb8ae040e65dc066ec92ed6306523981daf4670 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 14:06:52 +0800 Subject: [PATCH 29/56] [v0.9.1][Feature] add moe alltoallv. 
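The two test patches above restore test_models_distributed_topk and then normalize
the decorator spacing from "@ patch.dict" to "@patch.dict" (a style fix). For
reference, patch.dict overrides os.environ only while the decorated test runs and
restores the previous environment afterwards; a minimal standalone sketch:

    import os
    from unittest.mock import patch

    @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"})
    def check_env():
        # visible only while the decorated function runs
        assert os.environ["VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION"] == "1"

    check_env()
    # afterwards os.environ is restored to whatever it was before the call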
Signed-off-by: weijinqian_v1 --- tests/e2e/multicard/test_offline_inference_distributed.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index fe5dd8c1c29..92629cdca51 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -142,7 +142,6 @@ def test_models_distributed_topk() -> None: "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", ] - dtype = "half" sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, From bedb8d373bc6ccd3995e9b36326b9e258850c318 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 15:24:30 +0800 Subject: [PATCH 30/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_token_dispatcher.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/ut/test_token_dispatcher.py b/tests/ut/test_token_dispatcher.py index 18768a7fe8f..a163080cda9 100644 --- a/tests/ut/test_token_dispatcher.py +++ b/tests/ut/test_token_dispatcher.py @@ -19,17 +19,14 @@ import pytest from pytest_mock import MockerFixture +from tests.ut.base import PytestBase from vllm_ascend.ops.moe_dispatcher.token_dispatcher import ( MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig) from vllm_ascend.utils import adapt_patch # noqa E402 -import vllm_ascend.patch.worker.patch_common.patch_utils # type: ignore[import] # isort: skip # noqa -adapt_patch(True) - - -class TestMoEAlltoAllSeqOverLapDispatcher(unittest.TestCase): +class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase): @pytest.fixture def config(self): @@ -60,10 +57,9 @@ def dispatcher(self, config, mocker: MockerFixture): return MoEAlltoAllSeqOverLapDispatcher(config) def test_initialization(self, dispatcher, config): - self.assertEqual(dispatcher.num_local_experts, - config.num_local_experts) - self.assertEqual(dispatcher.num_experts, config.num_moe_experts) - self.assertEqual(dispatcher.local_expert_indices, [0, 1]) - self.assertEqual(dispatcher.ep_rank, 0) - self.assertEqual(dispatcher.ep_size, 2) - self.assertIsNotNone(dispatcher.overlap_stream) + assert dispatcher.num_local_experts == config.num_local_experts + assert dispatcher.num_experts == config.num_moe_experts + assert dispatcher.local_expert_indices == [0, 1] + assert dispatcher.ep_rank == 0 + assert dispatcher.ep_size == 2 + assert dispatcher.overlap_stream is not None From 288edf45636d5c4cc36f91bc1a9f3c1bbc8302cb Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 16:44:31 +0800 Subject: [PATCH 31/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_token_dispatcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/ut/test_token_dispatcher.py b/tests/ut/test_token_dispatcher.py index a163080cda9..ccb26ff2f73 100644 --- a/tests/ut/test_token_dispatcher.py +++ b/tests/ut/test_token_dispatcher.py @@ -15,8 +15,6 @@ # limitations under the License. # This file is a part of the vllm-ascend project. 
-import unittest - import pytest from pytest_mock import MockerFixture from tests.ut.base import PytestBase From 509fe5c6cb910dbaaa47a7e584b42a2b57756066 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 16:45:10 +0800 Subject: [PATCH 32/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_token_dispatcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ut/test_token_dispatcher.py b/tests/ut/test_token_dispatcher.py index ccb26ff2f73..ff1759d6b0b 100644 --- a/tests/ut/test_token_dispatcher.py +++ b/tests/ut/test_token_dispatcher.py @@ -17,8 +17,8 @@ import pytest from pytest_mock import MockerFixture -from tests.ut.base import PytestBase +from tests.ut.base import PytestBase from vllm_ascend.ops.moe_dispatcher.token_dispatcher import ( MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig) from vllm_ascend.utils import adapt_patch # noqa E402 From 000dbcccd0e479f335f812d55a33818788e3db8c Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 19:09:36 +0800 Subject: [PATCH 33/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 23 ++++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index 5a438e0cdf9..bd4a41ebfac 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -16,10 +16,9 @@ # This file is a part of the vllm-ascend project. import importlib -import unittest from unittest.mock import MagicMock, patch -import pytest +from tests.ut.base import PytestBase import torch from vllm_ascend.distributed.tensor_parallel import ( @@ -51,7 +50,7 @@ def mock_dist(): yield mock -class TestDistributedCommunication(unittest.TestCase): +class TestDistributedCommunication(PytestBase): @pytest.mark.parametrize("world_size", [1, 4]) def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, @@ -62,9 +61,9 @@ def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, result = _gather_along_first_dim(test_tensor, mock_group) if world_size == 1: - self.assertEqual(result.shape, (8, 16)) + assert result.shape == (8, 16) else: - self.assertEqual(result.shape, (32, 16)) # 8*4=32 + assert result.shape == (32, 16) # 8*4=32 def test_gather_along_first_dim_unequal_split(self, test_tensor, mock_group): @@ -72,7 +71,7 @@ def test_gather_along_first_dim_unequal_split(self, test_tensor, output_split_sizes = [5, 10, 15, 2] result = _gather_along_first_dim(test_tensor, mock_group, output_split_sizes) - self.assertEqual(result.shape, (32, 16)) # 5+10+15+2=32 + assert result.shape == (32, 16) # 5+10+15+2=32 @pytest.mark.parametrize("world_size", [1, 4]) def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, @@ -82,7 +81,7 @@ def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, result = _gather_along_last_dim(test_tensor_last_dim, mock_group) - self.assertEqual(result.shape, (8, 16, 32 * world_size)) + assert result.shape == (8, 16, 32 * world_size) @pytest.mark.parametrize("input_shape,expected_shape", [ ((32, 16), (8, 16)), @@ -92,12 +91,12 @@ def test_reduce_scatter_along_first_dim(self, mock_group, input_shape, expected_shape): input_tensor = torch.randn(*input_shape) result = _reduce_scatter_along_first_dim(input_tensor, mock_group) - self.assertEqual(result.shape, expected_shape) + assert result.shape == expected_shape 
def test_reduce_scatter_along_last_dim(self, mock_group): input_tensor = torch.randn(8, 16, 32) result = _reduce_scatter_along_last_dim(input_tensor, mock_group) - self.assertEqual(result.shape, (8, 16, 8)) + assert result.shape == (8, 16, 8) @pytest.mark.parametrize("func,input_shape,expected_shape", [ ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), @@ -116,7 +115,7 @@ def test_wrapper_functions(self, mock_group, func, input_shape, test_func = globals[func] input_tensor = torch.randn(*input_shape) result = test_func(input_tensor, mock_group) - self.assertEqual(result.shape, expected_shape) + assert result.shape == expected_shape @pytest.mark.parametrize( "input_shape,output_shape", @@ -126,7 +125,7 @@ def test_wrapper_functions(self, mock_group, func, input_shape, def test_all_to_all_sp2hp(self, mock_group, input_shape, output_shape): input_tensor = torch.randn(*input_shape) result = all_to_all_sp2hp(input_tensor, mock_group) - self.assertEqual(result.shape, output_shape) + assert result.shape == output_shape @pytest.mark.parametrize( "input_shape,output_shape", @@ -136,4 +135,4 @@ def test_all_to_all_sp2hp(self, mock_group, input_shape, output_shape): def test_all_to_all_hp2sp(self, mock_group, input_shape, output_shape): input_tensor = torch.randn(*input_shape) result = all_to_all_hp2sp(input_tensor, mock_group) - self.assertEqual(result.shape, output_shape) + assert result.shape == output_shape From ecd33b13d5153e3b25ee5cbe6a099e20ef39a2e3 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 19:12:53 +0800 Subject: [PATCH 34/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index bd4a41ebfac..7db25f4fb8f 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -18,9 +18,10 @@ import importlib from unittest.mock import MagicMock, patch -from tests.ut.base import PytestBase +import pytest import torch +from tests.ut.base import PytestBase from vllm_ascend.distributed.tensor_parallel import ( _gather_along_first_dim, _gather_along_last_dim, _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, From c23a6bfef59fcfb6598daae20be0cdce88060daf Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 20:03:45 +0800 Subject: [PATCH 35/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 178 ++++++++----------- 1 file changed, 78 insertions(+), 100 deletions(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index 7db25f4fb8f..ddf1000fb02 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -14,14 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # This file is a part of the vllm-ascend project. 
- -import importlib -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest import torch - -from tests.ut.base import PytestBase from vllm_ascend.distributed.tensor_parallel import ( _gather_along_first_dim, _gather_along_last_dim, _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, @@ -39,101 +35,83 @@ def test_tensor_last_dim(): @pytest.fixture -def mock_group(): - return MagicMock() +def mock_group(monkeypatch): + group = MagicMock() + + # 模拟 torch.distributed 函数 + monkeypatch.setattr(torch.distributed, "get_world_size", lambda *_: 4) + monkeypatch.setattr(torch.distributed, "get_rank", lambda *_: 0) + + return group @pytest.fixture(autouse=True) -def mock_dist(): - with patch("torch.distributed") as mock: - mock.get_world_size.return_value = 4 - mock.get_rank.return_value = 0 - yield mock - - -class TestDistributedCommunication(PytestBase): - - @pytest.mark.parametrize("world_size", [1, 4]) - def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, - world_size): - """test _gather_along_first_dim""" - mock_dist.get_world_size.return_value = world_size - - result = _gather_along_first_dim(test_tensor, mock_group) - - if world_size == 1: - assert result.shape == (8, 16) - else: - assert result.shape == (32, 16) # 8*4=32 - - def test_gather_along_first_dim_unequal_split(self, test_tensor, - mock_group): - """test unequal split""" - output_split_sizes = [5, 10, 15, 2] - result = _gather_along_first_dim(test_tensor, mock_group, - output_split_sizes) - assert result.shape == (32, 16) # 5+10+15+2=32 - - @pytest.mark.parametrize("world_size", [1, 4]) - def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, - mock_dist, world_size): - """test _gather_along_last_dim""" - mock_dist.get_world_size.return_value = world_size - - result = _gather_along_last_dim(test_tensor_last_dim, mock_group) - - assert result.shape == (8, 16, 32 * world_size) - - @pytest.mark.parametrize("input_shape,expected_shape", [ - ((32, 16), (8, 16)), - ((40, 10), (10, 10)), - ]) - def test_reduce_scatter_along_first_dim(self, mock_group, input_shape, - expected_shape): - input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_first_dim(input_tensor, mock_group) - assert result.shape == expected_shape - - def test_reduce_scatter_along_last_dim(self, mock_group): - input_tensor = torch.randn(8, 16, 32) - result = _reduce_scatter_along_last_dim(input_tensor, mock_group) - assert result.shape == (8, 16, 8) - - @pytest.mark.parametrize("func,input_shape,expected_shape", [ - ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), - (8, 16, 128)), - ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), - ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), - (8, 16, 8)), - ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), - ]) - def test_wrapper_functions(self, mock_group, func, input_shape, - expected_shape): - """test wrapper funcs""" - mod = importlib.import_module( - 'vllm_ascend.distributed.tensor_parallel') - globals = mod.__dict__ - test_func = globals[func] - input_tensor = torch.randn(*input_shape) - result = test_func(input_tensor, mock_group) - assert result.shape == expected_shape - - @pytest.mark.parametrize( - "input_shape,output_shape", - [ - ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] - ]) - def test_all_to_all_sp2hp(self, mock_group, input_shape, output_shape): - input_tensor = torch.randn(*input_shape) - result = all_to_all_sp2hp(input_tensor, 
mock_group) - assert result.shape == output_shape - - @pytest.mark.parametrize( - "input_shape,output_shape", - [ - ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] - ]) - def test_all_to_all_hp2sp(self, mock_group, input_shape, output_shape): - input_tensor = torch.randn(*input_shape) - result = all_to_all_hp2sp(input_tensor, mock_group) - assert result.shape == output_shape +def mock_npu_device(monkeypatch): + monkeypatch.setattr(torch.npu, "current_device", lambda: 0) + + +def test_gather_along_first_dim(test_tensor, mock_group): + result = _gather_along_first_dim(test_tensor, mock_group) + assert result.shape == (32, 16) # 8*4=32 + + output_split_sizes = [5, 10, 15, 2] + result = _gather_along_first_dim(test_tensor, mock_group, output_split_sizes) + assert result.shape == (32, 16) # 5+10+15+2=32 + + +def test_gather_along_last_dim(test_tensor_last_dim, mock_group): + result = _gather_along_last_dim(test_tensor_last_dim, mock_group) + assert result.shape == (8, 16, 128) # 32*4=128 + + +@pytest.mark.parametrize("input_shape,expected_shape", [ + ((32, 16), (8, 16)), + ((40, 10), (10, 10)), +]) +def test_reduce_scatter_along_first_dim(mock_group, input_shape, expected_shape): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_first_dim(input_tensor, mock_group) + assert result.shape == expected_shape + + +def test_reduce_scatter_along_last_dim(mock_group): + input_tensor = torch.randn(8, 16, 32) + result = _reduce_scatter_along_last_dim(input_tensor, mock_group) + assert result.shape == (8, 16, 8) # 32/4=8 + + +@pytest.mark.parametrize("func,input_shape,expected_shape", [ + ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), + (8, 16, 128)), + ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), + ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), + (8, 16, 8)), + ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), +]) +def test_wrapper_functions(mock_group, func, input_shape, expected_shape): + from vllm_ascend.distributed import tensor_parallel as tp + test_func = getattr(tp, func) + + input_tensor = torch.randn(*input_shape) + result = test_func(input_tensor, mock_group) + assert result.shape == expected_shape + + +@pytest.mark.parametrize( + "input_shape,output_shape", + [((8, 16), (32, 4))] # [num_tokens/TP, H] -> [num_tokens, H/TP] +) +def test_all_to_all_sp2hp(mock_group, input_shape, output_shape): + input_tensor = torch.randn(*input_shape) + result = all_to_all_sp2hp(input_tensor, mock_group) + assert result.shape == output_shape + + +@pytest.mark.parametrize( + "input_shape,output_shape", + [((32, 4), (8, 16))] # [num_tokens, H/TP] -> [num_tokens/TP, H] +) +def test_all_to_all_hp2sp(mock_group, input_shape, output_shape): + input_tensor = torch.randn(*input_shape) + result = all_to_all_hp2sp(input_tensor, mock_group) + assert result.shape == output_shape \ No newline at end of file From 31615e565a6a3dc171ab228a1b290977c28098fb Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 20:06:05 +0800 Subject: [PATCH 36/56] [v0.9.1][Feature] add moe alltoallv. 
Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 178 +++++++++++-------- 1 file changed, 100 insertions(+), 78 deletions(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index ddf1000fb02..7db25f4fb8f 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -14,10 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # This file is a part of the vllm-ascend project. -from unittest.mock import MagicMock + +import importlib +from unittest.mock import MagicMock, patch import pytest import torch + +from tests.ut.base import PytestBase from vllm_ascend.distributed.tensor_parallel import ( _gather_along_first_dim, _gather_along_last_dim, _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, @@ -35,83 +39,101 @@ def test_tensor_last_dim(): @pytest.fixture -def mock_group(monkeypatch): - group = MagicMock() - - # 模拟 torch.distributed 函数 - monkeypatch.setattr(torch.distributed, "get_world_size", lambda *_: 4) - monkeypatch.setattr(torch.distributed, "get_rank", lambda *_: 0) - - return group +def mock_group(): + return MagicMock() @pytest.fixture(autouse=True) -def mock_npu_device(monkeypatch): - monkeypatch.setattr(torch.npu, "current_device", lambda: 0) - - -def test_gather_along_first_dim(test_tensor, mock_group): - result = _gather_along_first_dim(test_tensor, mock_group) - assert result.shape == (32, 16) # 8*4=32 - - output_split_sizes = [5, 10, 15, 2] - result = _gather_along_first_dim(test_tensor, mock_group, output_split_sizes) - assert result.shape == (32, 16) # 5+10+15+2=32 - - -def test_gather_along_last_dim(test_tensor_last_dim, mock_group): - result = _gather_along_last_dim(test_tensor_last_dim, mock_group) - assert result.shape == (8, 16, 128) # 32*4=128 - - -@pytest.mark.parametrize("input_shape,expected_shape", [ - ((32, 16), (8, 16)), - ((40, 10), (10, 10)), -]) -def test_reduce_scatter_along_first_dim(mock_group, input_shape, expected_shape): - input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_first_dim(input_tensor, mock_group) - assert result.shape == expected_shape - - -def test_reduce_scatter_along_last_dim(mock_group): - input_tensor = torch.randn(8, 16, 32) - result = _reduce_scatter_along_last_dim(input_tensor, mock_group) - assert result.shape == (8, 16, 8) # 32/4=8 - - -@pytest.mark.parametrize("func,input_shape,expected_shape", [ - ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), - (8, 16, 128)), - ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), - ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), - (8, 16, 8)), - ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), -]) -def test_wrapper_functions(mock_group, func, input_shape, expected_shape): - from vllm_ascend.distributed import tensor_parallel as tp - test_func = getattr(tp, func) - - input_tensor = torch.randn(*input_shape) - result = test_func(input_tensor, mock_group) - assert result.shape == expected_shape - - -@pytest.mark.parametrize( - "input_shape,output_shape", - [((8, 16), (32, 4))] # [num_tokens/TP, H] -> [num_tokens, H/TP] -) -def test_all_to_all_sp2hp(mock_group, input_shape, output_shape): - input_tensor = torch.randn(*input_shape) - result = all_to_all_sp2hp(input_tensor, mock_group) - assert result.shape == output_shape - - -@pytest.mark.parametrize( - "input_shape,output_shape", - [((32, 4), (8, 16))] # 
[num_tokens, H/TP] -> [num_tokens/TP, H] -) -def test_all_to_all_hp2sp(mock_group, input_shape, output_shape): - input_tensor = torch.randn(*input_shape) - result = all_to_all_hp2sp(input_tensor, mock_group) - assert result.shape == output_shape \ No newline at end of file +def mock_dist(): + with patch("torch.distributed") as mock: + mock.get_world_size.return_value = 4 + mock.get_rank.return_value = 0 + yield mock + + +class TestDistributedCommunication(PytestBase): + + @pytest.mark.parametrize("world_size", [1, 4]) + def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, + world_size): + """test _gather_along_first_dim""" + mock_dist.get_world_size.return_value = world_size + + result = _gather_along_first_dim(test_tensor, mock_group) + + if world_size == 1: + assert result.shape == (8, 16) + else: + assert result.shape == (32, 16) # 8*4=32 + + def test_gather_along_first_dim_unequal_split(self, test_tensor, + mock_group): + """test unequal split""" + output_split_sizes = [5, 10, 15, 2] + result = _gather_along_first_dim(test_tensor, mock_group, + output_split_sizes) + assert result.shape == (32, 16) # 5+10+15+2=32 + + @pytest.mark.parametrize("world_size", [1, 4]) + def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, + mock_dist, world_size): + """test _gather_along_last_dim""" + mock_dist.get_world_size.return_value = world_size + + result = _gather_along_last_dim(test_tensor_last_dim, mock_group) + + assert result.shape == (8, 16, 32 * world_size) + + @pytest.mark.parametrize("input_shape,expected_shape", [ + ((32, 16), (8, 16)), + ((40, 10), (10, 10)), + ]) + def test_reduce_scatter_along_first_dim(self, mock_group, input_shape, + expected_shape): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_first_dim(input_tensor, mock_group) + assert result.shape == expected_shape + + def test_reduce_scatter_along_last_dim(self, mock_group): + input_tensor = torch.randn(8, 16, 32) + result = _reduce_scatter_along_last_dim(input_tensor, mock_group) + assert result.shape == (8, 16, 8) + + @pytest.mark.parametrize("func,input_shape,expected_shape", [ + ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), + (8, 16, 128)), + ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), + ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), + (8, 16, 8)), + ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), + ]) + def test_wrapper_functions(self, mock_group, func, input_shape, + expected_shape): + """test wrapper funcs""" + mod = importlib.import_module( + 'vllm_ascend.distributed.tensor_parallel') + globals = mod.__dict__ + test_func = globals[func] + input_tensor = torch.randn(*input_shape) + result = test_func(input_tensor, mock_group) + assert result.shape == expected_shape + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] + ]) + def test_all_to_all_sp2hp(self, mock_group, input_shape, output_shape): + input_tensor = torch.randn(*input_shape) + result = all_to_all_sp2hp(input_tensor, mock_group) + assert result.shape == output_shape + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] + ]) + def test_all_to_all_hp2sp(self, mock_group, input_shape, output_shape): + input_tensor = torch.randn(*input_shape) + result = all_to_all_hp2sp(input_tensor, mock_group) + assert result.shape == output_shape From 07e8dd89fee7346efae7f90f4aeea45415ec5975 
Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Wed, 30 Jul 2025 21:11:47 +0800 Subject: [PATCH 37/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index 7db25f4fb8f..57544378e5c 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -16,6 +16,7 @@ # This file is a part of the vllm-ascend project. import importlib +from unittest import mock from unittest.mock import MagicMock, patch import pytest @@ -53,6 +54,7 @@ def mock_dist(): class TestDistributedCommunication(PytestBase): + @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize("world_size", [1, 4]) def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, world_size): @@ -66,6 +68,7 @@ def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, else: assert result.shape == (32, 16) # 8*4=32 + @mock.patch("torch.npu.current_device", return_value="cpu") def test_gather_along_first_dim_unequal_split(self, test_tensor, mock_group): """test unequal split""" @@ -74,6 +77,7 @@ def test_gather_along_first_dim_unequal_split(self, test_tensor, output_split_sizes) assert result.shape == (32, 16) # 5+10+15+2=32 + @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize("world_size", [1, 4]) def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, mock_dist, world_size): @@ -84,6 +88,7 @@ def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, assert result.shape == (8, 16, 32 * world_size) + @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize("input_shape,expected_shape", [ ((32, 16), (8, 16)), ((40, 10), (10, 10)), @@ -94,11 +99,13 @@ def test_reduce_scatter_along_first_dim(self, mock_group, input_shape, result = _reduce_scatter_along_first_dim(input_tensor, mock_group) assert result.shape == expected_shape + @mock.patch("torch.npu.current_device", return_value="cpu") def test_reduce_scatter_along_last_dim(self, mock_group): input_tensor = torch.randn(8, 16, 32) result = _reduce_scatter_along_last_dim(input_tensor, mock_group) assert result.shape == (8, 16, 8) + @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize("func,input_shape,expected_shape", [ ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), (8, 16, 128)), @@ -118,6 +125,7 @@ def test_wrapper_functions(self, mock_group, func, input_shape, result = test_func(input_tensor, mock_group) assert result.shape == expected_shape + @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize( "input_shape,output_shape", [ @@ -128,6 +136,7 @@ def test_all_to_all_sp2hp(self, mock_group, input_shape, output_shape): result = all_to_all_sp2hp(input_tensor, mock_group) assert result.shape == output_shape + @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize( "input_shape,output_shape", [ From 41f6a3601d7871d0ad1d21bcd67ef1678e7019c1 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 10:04:43 +0800 Subject: [PATCH 38/56] [v0.9.1][Feature] add moe alltoallv. 
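This revision reworks tests/ut/test_distributed_tensor_parallel.py around pytest-mock: an autouse `context` fixture patches torch.npu.current_device, torch.distributed.get_world_size and torch.distributed.get_rank through MockerFixture, the per-test @mock.patch decorators are dropped, and the world-size dependent cases move into @pytest.mark.parametrize together with their input tensors.

For orientation, a minimal standalone sketch of the pytest-mock pattern relied on here (the test name and assertions are illustrative only, not part of the suite):

    import pytest
    import torch
    from pytest_mock import MockerFixture

    @pytest.mark.parametrize("world_size", [1, 4])
    def test_distributed_helpers_are_patched(world_size, mocker: MockerFixture):
        # Replace the process-group queries so no real backend is initialized.
        mocker.patch("torch.distributed.get_world_size", return_value=world_size)
        mocker.patch("torch.distributed.get_rank", return_value=0)

        assert torch.distributed.get_world_size() == world_size
        assert torch.distributed.get_rank() == 0
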
Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 136 +++++++++---------- 1 file changed, 67 insertions(+), 69 deletions(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index 57544378e5c..826f95e744a 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -16,96 +16,96 @@ # This file is a part of the vllm-ascend project. import importlib -from unittest import mock -from unittest.mock import MagicMock, patch - import pytest +from pytest_mock import MockerFixture import torch from tests.ut.base import PytestBase from vllm_ascend.distributed.tensor_parallel import ( - _gather_along_first_dim, _gather_along_last_dim, - _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, - all_to_all_hp2sp, all_to_all_sp2hp) - - -@pytest.fixture -def test_tensor(): - return torch.randn(8, 16) - +_gather_along_first_dim, _gather_along_last_dim, +_reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, +all_to_all_hp2sp, all_to_all_sp2hp) @pytest.fixture def test_tensor_last_dim(): return torch.randn(8, 16, 32) -@pytest.fixture -def mock_group(): - return MagicMock() - +class TestDistributedCommunication(PytestBase): -@pytest.fixture(autouse=True) -def mock_dist(): - with patch("torch.distributed") as mock: - mock.get_world_size.return_value = 4 - mock.get_rank.return_value = 0 - yield mock + @pytest.fixture(autouse=True) + def context(self, mocker: MockerFixture): + mocker.patch( + "torch.npu.current_device", + return_value="cpu") + mocker.patch( + "torch.distributed.get_world_size", + return_value=4) + + mocker.patch( + "torch.distributed.get_rank", + return_value=0) + + @pytest.mark.parametrize("world_size, test_tensor, expected", [ + (1, torch.randn(8, 16), (8, 16)), + (4, torch.randn(8, 16), (32, 16)) + ]) + def test_gather_along_first_dim(self, test_tensor, expected, + world_size, mocker: MockerFixture): + """test _gather_along_first_dim""" + mocker.patch( + "torch.distributed.get_world_size", + return_value=world_size) + result = _gather_along_first_dim(test_tensor, None) -class TestDistributedCommunication(PytestBase): + assert result.shape == expected - @mock.patch("torch.npu.current_device", return_value="cpu") - @pytest.mark.parametrize("world_size", [1, 4]) - def test_gather_along_first_dim(self, test_tensor, mock_group, mock_dist, - world_size): + @pytest.mark.parametrize("output_split_sizes, expected", [ + ([5, 10, 15, 2], (32, 16)), + ]) + def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, + world_size, mocker: MockerFixture): """test _gather_along_first_dim""" - mock_dist.get_world_size.return_value = world_size - - result = _gather_along_first_dim(test_tensor, mock_group) - - if world_size == 1: - assert result.shape == (8, 16) - else: - assert result.shape == (32, 16) # 8*4=32 - - @mock.patch("torch.npu.current_device", return_value="cpu") - def test_gather_along_first_dim_unequal_split(self, test_tensor, - mock_group): - """test unequal split""" - output_split_sizes = [5, 10, 15, 2] - result = _gather_along_first_dim(test_tensor, mock_group, - output_split_sizes) - assert result.shape == (32, 16) # 5+10+15+2=32 - - @mock.patch("torch.npu.current_device", return_value="cpu") - @pytest.mark.parametrize("world_size", [1, 4]) - def test_gather_along_last_dim(self, test_tensor_last_dim, mock_group, - mock_dist, world_size): + + result = _gather_along_first_dim(test_tensor, None) + + assert result.shape == 
expected + + @pytest.mark.parametrize("world_size, test_tensor, expected", [ + (1, torch.randn(8, 16, 32), (8, 16, 32)), + (4, torch.randn(8, 16, 32), (8, 16, 32 * 4)) + ]) + def test_gather_along_last_dim(self, test_tensor, expected, + world_size, mocker: MockerFixture): """test _gather_along_last_dim""" - mock_dist.get_world_size.return_value = world_size + mocker.patch( + "torch.distributed.get_world_size", + return_value=world_size) - result = _gather_along_last_dim(test_tensor_last_dim, mock_group) + result = _gather_along_last_dim(test_tensor, None) - assert result.shape == (8, 16, 32 * world_size) + assert result.shape == expected - @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize("input_shape,expected_shape", [ ((32, 16), (8, 16)), ((40, 10), (10, 10)), ]) - def test_reduce_scatter_along_first_dim(self, mock_group, input_shape, + def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape): input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_first_dim(input_tensor, mock_group) + result = _reduce_scatter_along_first_dim(input_tensor, None) assert result.shape == expected_shape - @mock.patch("torch.npu.current_device", return_value="cpu") - def test_reduce_scatter_along_last_dim(self, mock_group): - input_tensor = torch.randn(8, 16, 32) - result = _reduce_scatter_along_last_dim(input_tensor, mock_group) - assert result.shape == (8, 16, 8) + @pytest.mark.parametrize("input_shape,expected_shape", [ + ((8, 16, 32), (8, 16, 8)), + ]) + def test_reduce_scatter_along_last_dim(self, input_shape, + expected_shape): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_last_dim(input_tensor, None) + assert result.shape == expected_shape - @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize("func,input_shape,expected_shape", [ ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), (8, 16, 128)), @@ -114,7 +114,7 @@ def test_reduce_scatter_along_last_dim(self, mock_group): (8, 16, 8)), ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), ]) - def test_wrapper_functions(self, mock_group, func, input_shape, + def test_wrapper_functions(self, func, input_shape, expected_shape): """test wrapper funcs""" mod = importlib.import_module( @@ -122,27 +122,25 @@ def test_wrapper_functions(self, mock_group, func, input_shape, globals = mod.__dict__ test_func = globals[func] input_tensor = torch.randn(*input_shape) - result = test_func(input_tensor, mock_group) + result = test_func(input_tensor, None) assert result.shape == expected_shape - @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize( "input_shape,output_shape", [ ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] ]) - def test_all_to_all_sp2hp(self, mock_group, input_shape, output_shape): + def test_all_to_all_sp2hp(self, input_shape, output_shape): input_tensor = torch.randn(*input_shape) - result = all_to_all_sp2hp(input_tensor, mock_group) + result = all_to_all_sp2hp(input_tensor, None) assert result.shape == output_shape - @mock.patch("torch.npu.current_device", return_value="cpu") @pytest.mark.parametrize( "input_shape,output_shape", [ ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] ]) - def test_all_to_all_hp2sp(self, mock_group, input_shape, output_shape): + def test_all_to_all_hp2sp(self, input_shape, output_shape): input_tensor = torch.randn(*input_shape) - result = all_to_all_hp2sp(input_tensor, mock_group) + result = 
all_to_all_hp2sp(input_tensor, None) assert result.shape == output_shape From fe081b4d66bca0aa2c32dcd66f66433949e99990 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 10:17:18 +0800 Subject: [PATCH 39/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index 826f95e744a..22e7265cd03 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -22,13 +22,9 @@ from tests.ut.base import PytestBase from vllm_ascend.distributed.tensor_parallel import ( -_gather_along_first_dim, _gather_along_last_dim, -_reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, -all_to_all_hp2sp, all_to_all_sp2hp) - -@pytest.fixture -def test_tensor_last_dim(): - return torch.randn(8, 16, 32) + _gather_along_first_dim, _gather_along_last_dim, + _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, + all_to_all_hp2sp, all_to_all_sp2hp) class TestDistributedCommunication(PytestBase): @@ -61,14 +57,14 @@ def test_gather_along_first_dim(self, test_tensor, expected, assert result.shape == expected - @pytest.mark.parametrize("output_split_sizes, expected", [ - ([5, 10, 15, 2], (32, 16)), + @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [ + (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)), ]) def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, - world_size, mocker: MockerFixture): + output_split_sizes): """test _gather_along_first_dim""" - result = _gather_along_first_dim(test_tensor, None) + result = _gather_along_first_dim(test_tensor, None, output_split_sizes) assert result.shape == expected @@ -77,7 +73,7 @@ def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, (4, torch.randn(8, 16, 32), (8, 16, 32 * 4)) ]) def test_gather_along_last_dim(self, test_tensor, expected, - world_size, mocker: MockerFixture): + world_size, mocker: MockerFixture): """test _gather_along_last_dim""" mocker.patch( "torch.distributed.get_world_size", From 109678944e369d2174772493dccfc93c3c9a813d Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 10:39:49 +0800 Subject: [PATCH 40/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 29 ++++++++++---------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index 22e7265cd03..4b035fd9869 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -16,6 +16,7 @@ # This file is a part of the vllm-ascend project. 
import importlib + import pytest from pytest_mock import MockerFixture import torch @@ -53,7 +54,7 @@ def test_gather_along_first_dim(self, test_tensor, expected, "torch.distributed.get_world_size", return_value=world_size) - result = _gather_along_first_dim(test_tensor, None) + result = _gather_along_first_dim(test_tensor, mocker.MagicMock() ) assert result.shape == expected @@ -61,10 +62,10 @@ def test_gather_along_first_dim(self, test_tensor, expected, (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)), ]) def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, - output_split_sizes): + output_split_sizes, mocker: MockerFixture): """test _gather_along_first_dim""" - result = _gather_along_first_dim(test_tensor, None, output_split_sizes) + result = _gather_along_first_dim(test_tensor, mocker.MagicMock() , output_split_sizes) assert result.shape == expected @@ -79,7 +80,7 @@ def test_gather_along_last_dim(self, test_tensor, expected, "torch.distributed.get_world_size", return_value=world_size) - result = _gather_along_last_dim(test_tensor, None) + result = _gather_along_last_dim(test_tensor, mocker.MagicMock() ) assert result.shape == expected @@ -88,18 +89,18 @@ def test_gather_along_last_dim(self, test_tensor, expected, ((40, 10), (10, 10)), ]) def test_reduce_scatter_along_first_dim(self, input_shape, - expected_shape): + expected_shape, mocker: MockerFixture): input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_first_dim(input_tensor, None) + result = _reduce_scatter_along_first_dim(input_tensor, mocker.MagicMock() ) assert result.shape == expected_shape @pytest.mark.parametrize("input_shape,expected_shape", [ ((8, 16, 32), (8, 16, 8)), ]) def test_reduce_scatter_along_last_dim(self, input_shape, - expected_shape): + expected_shape, mocker: MockerFixture): input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_last_dim(input_tensor, None) + result = _reduce_scatter_along_last_dim(input_tensor, mocker.MagicMock() ) assert result.shape == expected_shape @pytest.mark.parametrize("func,input_shape,expected_shape", [ @@ -111,14 +112,14 @@ def test_reduce_scatter_along_last_dim(self, input_shape, ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), ]) def test_wrapper_functions(self, func, input_shape, - expected_shape): + expected_shape, mocker: MockerFixture): """test wrapper funcs""" mod = importlib.import_module( 'vllm_ascend.distributed.tensor_parallel') globals = mod.__dict__ test_func = globals[func] input_tensor = torch.randn(*input_shape) - result = test_func(input_tensor, None) + result = test_func(input_tensor, mocker.MagicMock() ) assert result.shape == expected_shape @pytest.mark.parametrize( @@ -126,9 +127,9 @@ def test_wrapper_functions(self, func, input_shape, [ ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] ]) - def test_all_to_all_sp2hp(self, input_shape, output_shape): + def test_all_to_all_sp2hp(self, input_shape, output_shape, mocker: MockerFixture): input_tensor = torch.randn(*input_shape) - result = all_to_all_sp2hp(input_tensor, None) + result = all_to_all_sp2hp(input_tensor, mocker.MagicMock() ) assert result.shape == output_shape @pytest.mark.parametrize( @@ -136,7 +137,7 @@ def test_all_to_all_sp2hp(self, input_shape, output_shape): [ ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] ]) - def test_all_to_all_hp2sp(self, input_shape, output_shape): + def test_all_to_all_hp2sp(self, input_shape, output_shape, mocker: MockerFixture): input_tensor = torch.randn(*input_shape) - 
result = all_to_all_hp2sp(input_tensor, None) + result = all_to_all_hp2sp(input_tensor, mocker.MagicMock() ) assert result.shape == output_shape From 442e26f7faa0f4390942c1adec0161ac9c88a196 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 10:41:51 +0800 Subject: [PATCH 41/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index 4b035fd9869..e902e3874dc 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -18,8 +18,8 @@ import importlib import pytest -from pytest_mock import MockerFixture import torch +from pytest_mock import MockerFixture from tests.ut.base import PytestBase from vllm_ascend.distributed.tensor_parallel import ( From 9130e589bb12ab8cd4f79eb5b5315ca361787f0e Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 11:05:47 +0800 Subject: [PATCH 42/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_distributed_tensor_parallel.py | 88 ++++++++++---------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/test_distributed_tensor_parallel.py index e902e3874dc..7a74ea1af67 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/test_distributed_tensor_parallel.py @@ -32,29 +32,21 @@ class TestDistributedCommunication(PytestBase): @pytest.fixture(autouse=True) def context(self, mocker: MockerFixture): - mocker.patch( - "torch.npu.current_device", - return_value="cpu") - mocker.patch( - "torch.distributed.get_world_size", - return_value=4) - - mocker.patch( - "torch.distributed.get_rank", - return_value=0) - - @pytest.mark.parametrize("world_size, test_tensor, expected", [ - (1, torch.randn(8, 16), (8, 16)), - (4, torch.randn(8, 16), (32, 16)) - ]) - def test_gather_along_first_dim(self, test_tensor, expected, - world_size, mocker: MockerFixture): + mocker.patch("torch.npu.current_device", return_value="cpu") + mocker.patch("torch.distributed.get_world_size", return_value=4) + + mocker.patch("torch.distributed.get_rank", return_value=0) + + @pytest.mark.parametrize("world_size, test_tensor, expected", + [(1, torch.randn(8, 16), (8, 16)), + (4, torch.randn(8, 16), (32, 16))]) + def test_gather_along_first_dim(self, test_tensor, expected, world_size, + mocker: MockerFixture): """test _gather_along_first_dim""" - mocker.patch( - "torch.distributed.get_world_size", - return_value=world_size) + mocker.patch("torch.distributed.get_world_size", + return_value=world_size) - result = _gather_along_first_dim(test_tensor, mocker.MagicMock() ) + result = _gather_along_first_dim(test_tensor, mocker.MagicMock()) assert result.shape == expected @@ -62,25 +54,25 @@ def test_gather_along_first_dim(self, test_tensor, expected, (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)), ]) def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, - output_split_sizes, mocker: MockerFixture): + output_split_sizes, + mocker: MockerFixture): """test _gather_along_first_dim""" - result = _gather_along_first_dim(test_tensor, mocker.MagicMock() , output_split_sizes) + result = _gather_along_first_dim(test_tensor, mocker.MagicMock(), + output_split_sizes) assert result.shape == expected - @pytest.mark.parametrize("world_size, test_tensor, expected", [ - (1, torch.randn(8, 
16, 32), (8, 16, 32)), - (4, torch.randn(8, 16, 32), (8, 16, 32 * 4)) - ]) - def test_gather_along_last_dim(self, test_tensor, expected, - world_size, mocker: MockerFixture): + @pytest.mark.parametrize("world_size, test_tensor, expected", + [(1, torch.randn(8, 16, 32), (8, 16, 32)), + (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))]) + def test_gather_along_last_dim(self, test_tensor, expected, world_size, + mocker: MockerFixture): """test _gather_along_last_dim""" - mocker.patch( - "torch.distributed.get_world_size", - return_value=world_size) + mocker.patch("torch.distributed.get_world_size", + return_value=world_size) - result = _gather_along_last_dim(test_tensor, mocker.MagicMock() ) + result = _gather_along_last_dim(test_tensor, mocker.MagicMock()) assert result.shape == expected @@ -88,19 +80,21 @@ def test_gather_along_last_dim(self, test_tensor, expected, ((32, 16), (8, 16)), ((40, 10), (10, 10)), ]) - def test_reduce_scatter_along_first_dim(self, input_shape, - expected_shape, mocker: MockerFixture): + def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape, + mocker: MockerFixture): input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_first_dim(input_tensor, mocker.MagicMock() ) + result = _reduce_scatter_along_first_dim(input_tensor, + mocker.MagicMock()) assert result.shape == expected_shape @pytest.mark.parametrize("input_shape,expected_shape", [ ((8, 16, 32), (8, 16, 8)), ]) - def test_reduce_scatter_along_last_dim(self, input_shape, - expected_shape, mocker: MockerFixture): + def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape, + mocker: MockerFixture): input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_last_dim(input_tensor, mocker.MagicMock() ) + result = _reduce_scatter_along_last_dim(input_tensor, + mocker.MagicMock()) assert result.shape == expected_shape @pytest.mark.parametrize("func,input_shape,expected_shape", [ @@ -111,15 +105,15 @@ def test_reduce_scatter_along_last_dim(self, input_shape, (8, 16, 8)), ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), ]) - def test_wrapper_functions(self, func, input_shape, - expected_shape, mocker: MockerFixture): + def test_wrapper_functions(self, func, input_shape, expected_shape, + mocker: MockerFixture): """test wrapper funcs""" mod = importlib.import_module( 'vllm_ascend.distributed.tensor_parallel') globals = mod.__dict__ test_func = globals[func] input_tensor = torch.randn(*input_shape) - result = test_func(input_tensor, mocker.MagicMock() ) + result = test_func(input_tensor, mocker.MagicMock()) assert result.shape == expected_shape @pytest.mark.parametrize( @@ -127,9 +121,10 @@ def test_wrapper_functions(self, func, input_shape, [ ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] ]) - def test_all_to_all_sp2hp(self, input_shape, output_shape, mocker: MockerFixture): + def test_all_to_all_sp2hp(self, input_shape, output_shape, + mocker: MockerFixture): input_tensor = torch.randn(*input_shape) - result = all_to_all_sp2hp(input_tensor, mocker.MagicMock() ) + result = all_to_all_sp2hp(input_tensor, mocker.MagicMock()) assert result.shape == output_shape @pytest.mark.parametrize( @@ -137,7 +132,8 @@ def test_all_to_all_sp2hp(self, input_shape, output_shape, mocker: MockerFixture [ ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] ]) - def test_all_to_all_hp2sp(self, input_shape, output_shape, mocker: MockerFixture): + def test_all_to_all_hp2sp(self, input_shape, output_shape, + mocker: MockerFixture): 
input_tensor = torch.randn(*input_shape) - result = all_to_all_hp2sp(input_tensor, mocker.MagicMock() ) + result = all_to_all_hp2sp(input_tensor, mocker.MagicMock()) assert result.shape == output_shape From 544c00757fd35ec8070cd8dd89b39507b999bed8 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 11:57:05 +0800 Subject: [PATCH 43/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/envs.py | 6 +++--- vllm_ascend/ops/fused_moe.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index ed169c327f3..704a41c26ea 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -107,11 +107,11 @@ "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": lambda: bool(int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", '0')) ), - # VLLM_ASCEND_MOE_ALL2ALL_BUFFER: + # MOE_ALL2ALL_BUFFER: # 0: default, normal init. # 1: enable moe_all2all_buffer. - "VLLM_ASCEND_MOE_ALL2ALL_BUFFER": - lambda: bool(int(os.getenv("VLLM_ASCEND_MOE_ALL2ALL_BUFFER", '0'))), + "MOE_ALL2ALL_BUFFER": + lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))), # Some models are optimized by vllm ascend. While in some case, e.g. rlhf # training, the optimized model may not be suitable. In this case, set this # value to False to disable the optimized model. diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index c819eeb3391..7bd8d747930 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -56,7 +56,7 @@ get_ascend_soc_version, get_rm_router_logits_state, is_310p) -VLLM_ASCEND_MOE_ALL2ALL_BUFFER: bool = envs_ascend.VLLM_ASCEND_MOE_ALL2ALL_BUFFER +MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER SELECT_GATING_TOPK_SOTFMAX_EXPERTS: bool = envs_ascend.SELECT_GATING_TOPK_SOTFMAX_EXPERTS @@ -1154,7 +1154,7 @@ def apply( topk_ids=topk_ids, top_k=top_k, expert_map=expert_map) - elif VLLM_ASCEND_MOE_ALL2ALL_BUFFER: + elif MOE_ALL2ALL_BUFFER: return fused_experts_with_all2all_buffer( hidden_states=x, w1=layer.w13_weight, From 6649ad6e7eaa8e0e4eaab22cc65fba73422d1da7 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 13:08:46 +0800 Subject: [PATCH 44/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- tests/ut/test_token_dispatcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/ut/test_token_dispatcher.py b/tests/ut/test_token_dispatcher.py index ff1759d6b0b..3a42b93c42d 100644 --- a/tests/ut/test_token_dispatcher.py +++ b/tests/ut/test_token_dispatcher.py @@ -52,6 +52,8 @@ def dispatcher(self, config, mocker: MockerFixture): mocker.patch( "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group", return_value=self.mock_ep_group(mocker)) + mocker.patch("torch.npu.current_device", return_value="cpu") + mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock) return MoEAlltoAllSeqOverLapDispatcher(config) def test_initialization(self, dispatcher, config): From f71847a09ed9e77da01e68b5ed3a891e5757a6d3 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 20:10:58 +0800 Subject: [PATCH 45/56] [v0.9.1][Feature] add moe alltoallv. 
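This commit restores the e2e topk test to the DeepSeek-V2-Lite checkpoint with dtype="half", rewords the envs.py description of VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ, plugs AscendSparseMoeBlock into Qwen3-MoE through CustomQwen3MoeDecoderLayer/CustomQwen3MoeModel instead of monkey-patching qwen3.Qwen3MoeSparseMoeBlock, and drops the now-unused async_all_gather/async_reduce_scatter helpers plus the overlapped permute/dispatch methods from the token dispatcher.

The alltoall_seq switch keeps the parsing convention used by the other vllm_ascend environment flags; a standalone sketch mirroring envs.py (not an additional API):

    import os

    # Unset or "0" -> default path; "1" -> enable the MoE alltoall_seq framework.
    VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ = bool(
        int(os.getenv("VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ", "0")))
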
Signed-off-by: weijinqian_v1 --- .../test_offline_inference_distributed.py | 7 +- vllm_ascend/envs.py | 2 +- vllm_ascend/models/qwen3_moe.py | 43 +++++- vllm_ascend/ops/comm_utils.py | 65 --------- .../ops/moe_dispatcher/token_dispatcher.py | 125 ------------------ 5 files changed, 46 insertions(+), 196 deletions(-) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index 92629cdca51..15abc2cb466 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -142,16 +142,15 @@ def test_models_distributed_topk() -> None: "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", ] + dtype = "half" sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9) with VllmRunner( - snapshot_download("vllm-ascend/pangu-pro-moe-pruing"), - max_model_len=8192, - enforce_eager=True, - dtype="auto", + "deepseek-ai/DeepSeek-V2-Lite", + dtype=dtype, tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 704a41c26ea..aea5275cf1b 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -159,7 +159,7 @@ # this feature is supported in A2, and eager mode will get better performance. "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))), - # VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ: + # Whether to enable the alltoall_seq flag, this provides a basic framework on the basis of alltoall for easy expansion. # 0: default, normal init. # 1: enable moe all2all seq. "VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 485e5ca92fc..0f3b9a83897 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -15,11 +15,48 @@ # limitations under the License. # Adapted from vllm/model_executor/models/qwen3_moe.py # This file is a part of the vllm-ascend project. 
+from typing import Optional import vllm.model_executor.models.qwen3_moe as qwen3 +from compressed_tensors import QuantizationConfig +from transformers import PretrainedConfig, CacheConfig from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM +from vllm.model_executor.models.utils import make_layers, maybe_prefix from vllm_ascend.ops.fused_moe import AscendSparseMoeBlock +from vllm_ascend.platform import VllmConfig + + +class CustomQwen3MoeDecoderLayer(qwen3.Qwen3MoeDecoderLayer): + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, cache_config, quant_config, prefix) + + self.mlp = AscendSparseMoeBlock(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + +class CustomQwen3MoeModel(qwen3.Qwen3MoeModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: CustomQwen3MoeDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): @@ -36,4 +73,8 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], } - qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.model = CustomQwen3MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) diff --git a/vllm_ascend/ops/comm_utils.py b/vllm_ascend/ops/comm_utils.py index 6c43773308b..e893049ed87 100644 --- a/vllm_ascend/ops/comm_utils.py +++ b/vllm_ascend/ops/comm_utils.py @@ -21,71 +21,6 @@ COMM_STREAM = None -def async_all_gather(input_, - group, - event=None, - is_use_get_global_memory_buffer=False): - world_size = torch.distributed.get_world_size(group) - dim_size = list(input_.size()) - new_dim_size = dim_size[0] * world_size - dim_size[0] = new_dim_size - - ag_out = torch.empty(dim_size, - dtype=input_.dtype, - device=torch.npu.current_device()) - if event: - # multi stream wait event - global COMM_STREAM - if COMM_STREAM is None: - COMM_STREAM = torch_npu.npu.Stream( - device=torch.npu.current_device()) - with torch_npu.npu.stream(COMM_STREAM): - event.wait() - handle = torch.distributed._all_gather_base(ag_out, - input_.contiguous(), - group=group, - async_op=True) - else: - handle = torch.distributed._all_gather_base(ag_out, - input_.contiguous(), - group=group, - async_op=True) - return input_, ag_out, handle - - -def async_reduce_scatter(input_, - group, - event=None, - stream=None, - is_use_get_global_memory_buffer=False): - world_size = dist.get_world_size(group) - dim_size = list(input_.size()) - dim_size[0] = dim_size[0] // world_size - - rs_out = torch.empty(dim_size, - dtype=input_.dtype, - device=torch.npu.current_device()) - if event or stream: - # multi stream wait event - global COMM_STREAM - if COMM_STREAM is None: - COMM_STREAM = torch_npu.npu.Stream( - device=torch.npu.current_device()) - with torch_npu.npu.stream(COMM_STREAM): - if event: - event.wait() - if stream: - 
torch.npu.current_stream().wait_stream(stream) - handle = torch.distributed.reduce_scatter_tensor( - rs_out, input_.contiguous(), group=group, async_op=True) - else: - handle = torch.distributed.reduce_scatter_tensor(rs_out, - input_.contiguous(), - group=group, - async_op=True) - return input_, rs_out, handle - - def async_all_to_all(input_, output_split_sizes, input_split_sizes, diff --git a/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py index 91118e296de..402e8fb93ad 100644 --- a/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py +++ b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py @@ -382,131 +382,6 @@ def alltoall_token_permutation2(global_input_tokens): return share_experts_output, global_input_tokens, tokens_per_expert - def preprocess_and_permtute1(self, - hidden_states: torch.Tensor, - probs: torch.Tensor, - routing_map: torch.Tensor, - shared_experts=None, - shared_experts_input: torch.Tensor = None): - self.hidden_shape = hidden_states.shape - self.probs = probs - self.top_indices = routing_map - assert probs.dim() == 2, "Expected 2D tensor for probs" - assert routing_map.dim() == 2, "Expected 2D tensor for routing map" - assert self.hidden_shape is not None - - hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) - tokens_per_expert = self.preprocess(routing_map, with_sync=False) - self.hidden_shape_before_permute = hidden_states.shape - - if self.device_sync_point == "before_permutation_1": - torch.npu.current_stream().synchronize() - - event = torch.npu.current_stream().record_event() - self.perm1_finish_event = torch.npu.Event() - with torch.npu.stream(self.overlap_stream): - assert self.overlap_stream is not None - self.overlap_stream.wait_event(event) - - if shared_experts is not None: - shared_output = shared_experts(shared_experts_input) - self.cached_shared_expert_output = shared_output - - hidden_states, self.reversed_local_input_permutation_mapping = torch_npu.npu_moe_token_permute( - tokens=hidden_states, - indices=self.top_indices, - num_out_tokens=self.num_out_tokens, - ) - - self.perm1_finish_event.record() - - # repeat interleve will launch a sync on current_stream. - if self.num_local_experts > 1: - self.device_sync_point = "no_sync" - if self.num_global_tokens_per_local_expert is None: - raise ValueError( - "num_global_tokens_per_local_expert must be set before operations." - ) - self.global_input_tokens_local_experts_indices = torch.repeat_interleave( - self.expert_ids_per_ep_rank, - self.num_global_tokens_per_local_expert.ravel()) - - self.cached_permutated_local_input_tokens = hidden_states - self.tokens_per_expert = tokens_per_expert - - def dispatch_alltoall(self): - ep_group = self.ep_group - - # Perform expert parallel AlltoAll communication - if self.device_sync_point == "before_ep_alltoall": - torch.npu.current_stream().synchronize() - - torch.npu.current_stream().wait_event(self.perm1_finish_event) - self.perm1_finish_event = None - _, self.cached_global_input_tokens, permute1_ep_all_to_all_handle = async_all_to_all( - self.cached_permutated_local_input_tokens, - self.output_splits, - self.input_splits, - ep_group, - ) - permute1_ep_all_to_all_handle.wait() - if self.cached_permutated_local_input_tokens is None: - raise ValueError( - "cached_permutated_local_input_tokens must be set before operations." 
- ) - self.cached_permutated_local_input_tokens.untyped_storage().resize_(0) - self.cached_permutated_local_input_tokens = None - - def permute2(self): - global_input_tokens = self.cached_global_input_tokens - if self.num_local_experts > 1: - global_input_tokens, self.reversed_global_input_permutation_mapping = torch_npu.npu_moe_token_permute( - self.cached_global_input_tokens, - self.global_input_tokens_local_experts_indices) - assert self.cached_global_input_tokens is not None - self.cached_global_input_tokens.untyped_storage().resize_(0) - self.cached_global_input_tokens = None - - return global_input_tokens, self.tokens_per_expert - - def unpermute1(self, hidden_states: torch.Tensor): - # Unpermutation 2: expert output to AlltoAll input - if hidden_states.shape[0] > 0 and self.num_local_experts > 1: - hidden_states = torch_npu.npu_moe_token_unpermute( - hidden_states, self.reversed_global_input_permutation_mapping) - self.cached_global_output_tokens = hidden_states - self.reversed_global_input_permutation_mapping = None - - def combine_alltoall(self): - ep_group = self.ep_group - # Perform expert parallel AlltoAll communication - # hidden_states: [SEQL, H] -> [SEQL, H/TP] - _, self.cached_local_output_tokens, handle = async_all_to_all( - self.cached_global_output_tokens, self.input_splits, - self.output_splits, ep_group) - handle.wait() - self.cached_global_output_tokens.untyped_storage().resize_(0) - self.cached_global_output_tokens = None - self.input_splits = None - self.output_splits = None - - def unpermute2(self): - output = torch_npu.npu_moe_token_unpermute( - permuted_tokens=self.cached_local_output_tokens, - sorted_indices=self.reversed_local_input_permutation_mapping.to( - torch.int32), - probs=self.probs, - restore_shape=self.hidden_shape_before_permute) - - output = output.view(self.hidden_shape) - - self.probs = None - self.reversed_local_input_permutation_mapping = None - self.cached_local_output_tokens.untyped_storage().resize_(0) - self.cached_local_output_tokens = None - - return output - def token_unpermutation(self, hidden_states: torch.Tensor, bias: torch.Tensor = None): From 402c006b3c0f813d9653d10354d29cff10e01c1f Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 20:41:12 +0800 Subject: [PATCH 46/56] [v0.9.1][Feature] add moe alltoallv. 
Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 0f3b9a83897..8ae33bb52c0 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -19,9 +19,12 @@ import vllm.model_executor.models.qwen3_moe as qwen3 from compressed_tensors import QuantizationConfig +from torch import nn from transformers import PretrainedConfig, CacheConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM -from vllm.model_executor.models.utils import make_layers, maybe_prefix +from vllm.model_executor.models.utils import make_layers, maybe_prefix, make_empty_intermediate_tensors_factory from vllm_ascend.ops.fused_moe import AscendSparseMoeBlock from vllm_ascend.platform import VllmConfig @@ -42,13 +45,20 @@ def __init__( prefix=f"{prefix}.mlp") -class CustomQwen3MoeModel(qwen3.Qwen3MoeModel): +class CustomQwen3MoeModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) + config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: CustomQwen3MoeDecoderLayer(config=config, @@ -57,6 +67,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=prefix), prefix=f"{prefix}.layers", ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): From d3c188dabcd2ce7f613cddbe20e3052ef9c251b9 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 20:42:13 +0800 Subject: [PATCH 47/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 8ae33bb52c0..2e24a794fd0 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -46,7 +46,9 @@ def __init__( class CustomQwen3MoeModel(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super.__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config From 54dbf76a8866b72267b0c44b66590224213c8032 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 20:53:15 +0800 Subject: [PATCH 48/56] [v0.9.1][Feature] add moe alltoallv. 
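This fixup re-bases CustomQwen3MoeModel on qwen3.Qwen3MoeModel and replaces the broken `super.__init__()` call (a typo for `super().__init__()` that fails at runtime) with an explicit `nn.Module.__init__(self)`, which sets up the module state without re-running the parent constructor. A minimal standalone illustration of that pattern, with hypothetical class names:

    import torch.nn as nn

    class Base(nn.Module):
        def __init__(self):
            super().__init__()
            self.built_by_base = True

    class Child(Base):
        def __init__(self):
            # Initialize only the nn.Module internals; deliberately skip Base.__init__.
            nn.Module.__init__(self)
            self.built_by_base = False

    child = Child()
    assert isinstance(child, Base) and child.built_by_base is False
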
Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 2e24a794fd0..58533eab36c 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -45,11 +45,9 @@ def __init__( prefix=f"{prefix}.mlp") -class CustomQwen3MoeModel(nn.Module): - +class CustomQwen3MoeModel(qwen3.Qwen3MoeModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super.__init__() - + nn.Module.__init__(self) config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config From cfff17a54571bcaa754dfdec3df44e91ec734a56 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 20:58:19 +0800 Subject: [PATCH 49/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 44 ++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 58533eab36c..3e28b066371 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -24,7 +24,8 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM -from vllm.model_executor.models.utils import make_layers, maybe_prefix, make_empty_intermediate_tensors_factory +from vllm.model_executor.models.utils import make_layers, maybe_prefix, make_empty_intermediate_tensors_factory, \ + extract_layer_index from vllm_ascend.ops.fused_moe import AscendSparseMoeBlock from vllm_ascend.platform import VllmConfig @@ -38,11 +39,48 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: - super().__init__(config, cache_config, quant_config, prefix) - self.mlp = AscendSparseMoeBlock(config=config, + nn.Module.__init__(self) + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = qwen3.Qwen3MoeAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, 'attention_bias', False), + head_dim=getattr(config, 'head_dim', None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + # `mlp_only_layers` in the config. 
+ layer_idx = extract_layer_index(prefix) + mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else + config.mlp_only_layers) + if (layer_idx not in mlp_only_layers) and ( + config.num_experts > 0 and + (layer_idx + 1) % config.decoder_sparse_step == 0): + self.mlp = AscendSparseMoeBlock(config=config, quant_config=quant_config, prefix=f"{prefix}.mlp") + else: + self.mlp = qwen3.Qwen3MoeMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) class CustomQwen3MoeModel(qwen3.Qwen3MoeModel): From 1336eb3784c9a5a4d847ae7581ca73ff0ef44efa Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 21:05:21 +0800 Subject: [PATCH 50/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 39 +++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 3e28b066371..ba163b88859 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -17,21 +17,24 @@ # This file is a part of the vllm-ascend project. from typing import Optional -import vllm.model_executor.models.qwen3_moe as qwen3 -from compressed_tensors import QuantizationConfig from torch import nn -from transformers import PretrainedConfig, CacheConfig +from transformers import PretrainedConfig + +from vllm.config import CacheConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM -from vllm.model_executor.models.utils import make_layers, maybe_prefix, make_empty_intermediate_tensors_factory, \ - extract_layer_index +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.models.qwen3_moe import Qwen3MoeMLP, Qwen3MoeModel, Qwen3MoeForCausalLM, Qwen3MoeDecoderLayer +from vllm.model_executor.models.utils import extract_layer_index, make_empty_intermediate_tensors_factory, maybe_prefix, \ + make_layers from vllm_ascend.ops.fused_moe import AscendSparseMoeBlock from vllm_ascend.platform import VllmConfig -class CustomQwen3MoeDecoderLayer(qwen3.Qwen3MoeDecoderLayer): +class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer): def __init__( self, config: PretrainedConfig, @@ -46,7 +49,7 @@ def __init__( rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - self.self_attn = qwen3.Qwen3MoeAttention( + self.self_attn = Qwen3MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, @@ -72,7 +75,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.mlp") else: - self.mlp = qwen3.Qwen3MoeMLP(hidden_size=config.hidden_size, + self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, @@ -83,7 +86,7 @@ def __init__( eps=config.rms_norm_eps) 
-class CustomQwen3MoeModel(qwen3.Qwen3MoeModel): +class CustomQwen3MoeModel(Qwen3MoeModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) config = vllm_config.model_config.hf_config @@ -127,6 +130,18 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config self.model = CustomQwen3MoeModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) From a35f81293ade387c5d755bd9245e09088df2f719 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 21:09:00 +0800 Subject: [PATCH 51/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index ba163b88859..7f198f4df43 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -26,7 +26,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.models.qwen3_moe import Qwen3MoeMLP, Qwen3MoeModel, Qwen3MoeForCausalLM, Qwen3MoeDecoderLayer +from vllm.model_executor.models.qwen3_moe import Qwen3MoeMLP, Qwen3MoeModel, Qwen3MoeForCausalLM, Qwen3MoeDecoderLayer, \ + Qwen3MoeAttention from vllm.model_executor.models.utils import extract_layer_index, make_empty_intermediate_tensors_factory, maybe_prefix, \ make_layers @@ -130,7 +131,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() + nn.Module.__init__(self) config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config From f8aa32b04b7acdaaea9f2ef408bd5d4cf2bf47f3 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 21:10:52 +0800 Subject: [PATCH 52/56] [v0.9.1][Feature] add moe alltoallv. 
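In the decoder layer shown above, whether a layer gets an AscendSparseMoeBlock or a dense Qwen3MoeMLP is decided from the layer index recovered with extract_layer_index(prefix): a layer is sparse only if it is not listed in mlp_only_layers, the model has experts, and (layer_idx + 1) is a multiple of decoder_sparse_step. The helper below simply restates that predicate for clarity; the function name and the example config values are invented for illustration.

def uses_moe(layer_idx: int,
             num_experts: int,
             decoder_sparse_step: int,
             mlp_only_layers: list) -> bool:
    """Return True when layer `layer_idx` should use a sparse MoE block."""
    return (layer_idx not in mlp_only_layers
            and num_experts > 0
            and (layer_idx + 1) % decoder_sparse_step == 0)


# With decoder_sparse_step=1 every layer is MoE unless explicitly excluded:
print([uses_moe(i, num_experts=128, decoder_sparse_step=1, mlp_only_layers=[])
       for i in range(4)])  # [True, True, True, True]
# With decoder_sparse_step=2 only every second layer is MoE:
print([uses_moe(i, num_experts=128, decoder_sparse_step=2, mlp_only_layers=[])
       for i in range(4)])  # [False, True, False, True]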
Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 7f198f4df43..756bcdb275a 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -19,6 +19,7 @@ from torch import nn from transformers import PretrainedConfig +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm @@ -86,7 +87,7 @@ def __init__( self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - +@support_torch_compile class CustomQwen3MoeModel(Qwen3MoeModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) From 01ebd07a0572637e6706d3d66fa3b0c07e86a1c6 Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Thu, 31 Jul 2025 21:11:51 +0800 Subject: [PATCH 53/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 40 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 756bcdb275a..b0b0bdb17c1 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -20,23 +20,26 @@ from torch import nn from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile - from vllm.config import CacheConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.models.qwen3_moe import Qwen3MoeMLP, Qwen3MoeModel, Qwen3MoeForCausalLM, Qwen3MoeDecoderLayer, \ - Qwen3MoeAttention -from vllm.model_executor.models.utils import extract_layer_index, make_empty_intermediate_tensors_factory, maybe_prefix, \ - make_layers +from vllm.model_executor.models.qwen3_moe import (Qwen3MoeAttention, + Qwen3MoeDecoderLayer, + Qwen3MoeForCausalLM, + Qwen3MoeMLP, Qwen3MoeModel) +from vllm.model_executor.models.utils import ( + extract_layer_index, make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) from vllm_ascend.ops.fused_moe import AscendSparseMoeBlock from vllm_ascend.platform import VllmConfig class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer): + def __init__( self, config: PretrainedConfig, @@ -72,23 +75,25 @@ def __init__( config.mlp_only_layers) if (layer_idx not in mlp_only_layers) and ( config.num_experts > 0 and - (layer_idx + 1) % config.decoder_sparse_step == 0): + (layer_idx + 1) % config.decoder_sparse_step == 0): self.mlp = AscendSparseMoeBlock(config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") + quant_config=quant_config, + prefix=f"{prefix}.mlp") else: self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp") + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + 
@support_torch_compile class CustomQwen3MoeModel(Qwen3MoeModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) config = vllm_config.model_config.hf_config @@ -104,10 +109,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: CustomQwen3MoeDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + lambda prefix: CustomQwen3MoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -138,7 +144,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.quant_config = quant_config self.model = CustomQwen3MoeModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) From 57b5378df02d30d4d6cd3b7601cbfe236b2370de Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Fri, 1 Aug 2025 09:18:05 +0800 Subject: [PATCH 54/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- vllm_ascend/models/qwen3_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index b0b0bdb17c1..0c5ad39c173 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -20,7 +20,7 @@ from torch import nn from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig From aa26b192ff27a29504f403f155dd84ca9a4f1bec Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Fri, 1 Aug 2025 09:21:00 +0800 Subject: [PATCH 55/56] [v0.9.1][Feature] add moe alltoallv. Signed-off-by: weijinqian_v1 --- requirements-dev.txt | 1 + requirements.txt | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 4f36cd70d9b..5e31485561b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,3 +17,4 @@ ray>=2.47.1 protobuf==4.25.6 librosa soundfile +pytest_mock \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index effdf838b52..c2b2a3175eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,5 +27,3 @@ numba --pre --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi torch-npu==2.5.1.post1.dev20250619 - -pytest_mock From c4993df4228075d2abcc5558d2fc88d513960e3f Mon Sep 17 00:00:00 2001 From: weijinqian_v1 Date: Fri, 1 Aug 2025 19:25:08 +0800 Subject: [PATCH 56/56] [v0.9.1][Feature] add moe alltoallv. 
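The test rewrite below, together with the move of pytest_mock into requirements-dev.txt above, switches the communication unit tests from unittest.TestCase with manual patches to pytest fixtures plus pytest-mock: world size and rank are stubbed through mocker.patch, so the shape checks run without an NPU or an initialized process group. A condensed sketch of that pattern, using a hypothetical my_collective function rather than any helper from this repository:

import pytest
import torch
from pytest_mock import MockerFixture


def my_collective(x: torch.Tensor, group) -> torch.Tensor:
    """Toy stand-in: pretend every rank contributes one copy of `x`."""
    world_size = torch.distributed.get_world_size(group)
    return x.repeat(world_size, 1)


@pytest.mark.parametrize("world_size, expected_rows", [(1, 8), (4, 32)])
def test_my_collective(world_size, expected_rows, mocker: MockerFixture):
    # No process group is created; the collective size is mocked instead.
    mocker.patch("torch.distributed.get_world_size", return_value=world_size)
    out = my_collective(torch.randn(8, 16), mocker.MagicMock())
    assert out.shape == (expected_rows, 16)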
Signed-off-by: weijinqian_v1 --- .github/workflows/vllm_ascend_test.yaml | 1 + .../test_distributed_tensor_parallel.py | 278 +++++++++--------- tests/ut/{ => ops}/test_token_dispatcher.py | 0 3 files changed, 140 insertions(+), 139 deletions(-) rename tests/ut/{ => distributed}/test_distributed_tensor_parallel.py (97%) rename tests/ut/{ => ops}/test_token_dispatcher.py (100%) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 580559c9483..afa05677ee0 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -277,6 +277,7 @@ jobs: pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv pytest -sv tests/e2e/multicard/test_data_parallel.py pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ diff --git a/tests/ut/test_distributed_tensor_parallel.py b/tests/ut/distributed/test_distributed_tensor_parallel.py similarity index 97% rename from tests/ut/test_distributed_tensor_parallel.py rename to tests/ut/distributed/test_distributed_tensor_parallel.py index 7a74ea1af67..48a88fa1f61 100644 --- a/tests/ut/test_distributed_tensor_parallel.py +++ b/tests/ut/distributed/test_distributed_tensor_parallel.py @@ -1,139 +1,139 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
- -import importlib - -import pytest -import torch -from pytest_mock import MockerFixture - -from tests.ut.base import PytestBase -from vllm_ascend.distributed.tensor_parallel import ( - _gather_along_first_dim, _gather_along_last_dim, - _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, - all_to_all_hp2sp, all_to_all_sp2hp) - - -class TestDistributedCommunication(PytestBase): - - @pytest.fixture(autouse=True) - def context(self, mocker: MockerFixture): - mocker.patch("torch.npu.current_device", return_value="cpu") - mocker.patch("torch.distributed.get_world_size", return_value=4) - - mocker.patch("torch.distributed.get_rank", return_value=0) - - @pytest.mark.parametrize("world_size, test_tensor, expected", - [(1, torch.randn(8, 16), (8, 16)), - (4, torch.randn(8, 16), (32, 16))]) - def test_gather_along_first_dim(self, test_tensor, expected, world_size, - mocker: MockerFixture): - """test _gather_along_first_dim""" - mocker.patch("torch.distributed.get_world_size", - return_value=world_size) - - result = _gather_along_first_dim(test_tensor, mocker.MagicMock()) - - assert result.shape == expected - - @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [ - (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)), - ]) - def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, - output_split_sizes, - mocker: MockerFixture): - """test _gather_along_first_dim""" - - result = _gather_along_first_dim(test_tensor, mocker.MagicMock(), - output_split_sizes) - - assert result.shape == expected - - @pytest.mark.parametrize("world_size, test_tensor, expected", - [(1, torch.randn(8, 16, 32), (8, 16, 32)), - (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))]) - def test_gather_along_last_dim(self, test_tensor, expected, world_size, - mocker: MockerFixture): - """test _gather_along_last_dim""" - mocker.patch("torch.distributed.get_world_size", - return_value=world_size) - - result = _gather_along_last_dim(test_tensor, mocker.MagicMock()) - - assert result.shape == expected - - @pytest.mark.parametrize("input_shape,expected_shape", [ - ((32, 16), (8, 16)), - ((40, 10), (10, 10)), - ]) - def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_first_dim(input_tensor, - mocker.MagicMock()) - assert result.shape == expected_shape - - @pytest.mark.parametrize("input_shape,expected_shape", [ - ((8, 16, 32), (8, 16, 8)), - ]) - def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_last_dim(input_tensor, - mocker.MagicMock()) - assert result.shape == expected_shape - - @pytest.mark.parametrize("func,input_shape,expected_shape", [ - ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), - (8, 16, 128)), - ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), - ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), - (8, 16, 8)), - ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), - ]) - def test_wrapper_functions(self, func, input_shape, expected_shape, - mocker: MockerFixture): - """test wrapper funcs""" - mod = importlib.import_module( - 'vllm_ascend.distributed.tensor_parallel') - globals = mod.__dict__ - test_func = globals[func] - input_tensor = torch.randn(*input_shape) - result = test_func(input_tensor, mocker.MagicMock()) - assert result.shape == expected_shape - - 
@pytest.mark.parametrize( - "input_shape,output_shape", - [ - ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] - ]) - def test_all_to_all_sp2hp(self, input_shape, output_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = all_to_all_sp2hp(input_tensor, mocker.MagicMock()) - assert result.shape == output_shape - - @pytest.mark.parametrize( - "input_shape,output_shape", - [ - ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] - ]) - def test_all_to_all_hp2sp(self, input_shape, output_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = all_to_all_hp2sp(input_tensor, mocker.MagicMock()) - assert result.shape == output_shape +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +import importlib + +import pytest +import torch +from pytest_mock import MockerFixture + +from tests.ut.base import PytestBase +from vllm_ascend.distributed.tensor_parallel import ( + _gather_along_first_dim, _gather_along_last_dim, + _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, + all_to_all_hp2sp, all_to_all_sp2hp) + + +class TestDistributedCommunication(PytestBase): + + @pytest.fixture(autouse=True) + def context(self, mocker: MockerFixture): + mocker.patch("torch.npu.current_device", return_value="cpu") + mocker.patch("torch.distributed.get_world_size", return_value=4) + + mocker.patch("torch.distributed.get_rank", return_value=0) + + @pytest.mark.parametrize("world_size, test_tensor, expected", + [(1, torch.randn(8, 16), (8, 16)), + (4, torch.randn(8, 16), (32, 16))]) + def test_gather_along_first_dim(self, test_tensor, expected, world_size, + mocker: MockerFixture): + """test _gather_along_first_dim""" + mocker.patch("torch.distributed.get_world_size", + return_value=world_size) + + result = _gather_along_first_dim(test_tensor, mocker.MagicMock()) + + assert result.shape == expected + + @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [ + (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)), + ]) + def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, + output_split_sizes, + mocker: MockerFixture): + """test _gather_along_first_dim""" + + result = _gather_along_first_dim(test_tensor, mocker.MagicMock(), + output_split_sizes) + + assert result.shape == expected + + @pytest.mark.parametrize("world_size, test_tensor, expected", + [(1, torch.randn(8, 16, 32), (8, 16, 32)), + (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))]) + def test_gather_along_last_dim(self, test_tensor, expected, world_size, + mocker: MockerFixture): + """test _gather_along_last_dim""" + mocker.patch("torch.distributed.get_world_size", + return_value=world_size) + + result = _gather_along_last_dim(test_tensor, mocker.MagicMock()) + + assert result.shape == expected + + @pytest.mark.parametrize("input_shape,expected_shape", [ + ((32, 16), (8, 16)), + 
((40, 10), (10, 10)), + ]) + def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_first_dim(input_tensor, + mocker.MagicMock()) + assert result.shape == expected_shape + + @pytest.mark.parametrize("input_shape,expected_shape", [ + ((8, 16, 32), (8, 16, 8)), + ]) + def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_last_dim(input_tensor, + mocker.MagicMock()) + assert result.shape == expected_shape + + @pytest.mark.parametrize("func,input_shape,expected_shape", [ + ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), + (8, 16, 128)), + ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), + ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), + (8, 16, 8)), + ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), + ]) + def test_wrapper_functions(self, func, input_shape, expected_shape, + mocker: MockerFixture): + """test wrapper funcs""" + mod = importlib.import_module( + 'vllm_ascend.distributed.tensor_parallel') + globals = mod.__dict__ + test_func = globals[func] + input_tensor = torch.randn(*input_shape) + result = test_func(input_tensor, mocker.MagicMock()) + assert result.shape == expected_shape + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] + ]) + def test_all_to_all_sp2hp(self, input_shape, output_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = all_to_all_sp2hp(input_tensor, mocker.MagicMock()) + assert result.shape == output_shape + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] + ]) + def test_all_to_all_hp2sp(self, input_shape, output_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = all_to_all_hp2sp(input_tensor, mocker.MagicMock()) + assert result.shape == output_shape diff --git a/tests/ut/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py similarity index 100% rename from tests/ut/test_token_dispatcher.py rename to tests/ut/ops/test_token_dispatcher.py
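For reference, the sp2hp/hp2sp cases at the end of the relocated test file swap which dimension is sharded: [num_tokens/TP, H] becomes [num_tokens, H/TP] and back. The snippet below only reproduces that shape bookkeeping on a single process with plain reshapes (TP=4 assumed); it performs no real cross-rank exchange.

import torch

tp = 4
x_sp = torch.arange(8 * 16, dtype=torch.float32).reshape(8, 16)  # [num_tokens/TP, H]

# sp2hp: after the exchange every rank holds all tokens but only H/TP channels.
x_hp = x_sp.reshape(8, tp, 16 // tp).permute(1, 0, 2).reshape(8 * tp, 16 // tp)
assert x_hp.shape == (32, 4)

# hp2sp: the inverse transform restores the original [num_tokens/TP, H] layout.
x_back = x_hp.reshape(tp, 8, 16 // tp).permute(1, 0, 2).reshape(8, 16)
assert torch.equal(x_back, x_sp)

With pytest-mock installed from requirements-dev.txt, the relocated suites can be run directly, for example: pytest -sv tests/ut/distributed/test_distributed_tensor_parallel.py tests/ut/ops/test_token_dispatcher.py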