Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions tests/e2e/310p/multicard/test_moe_model_multicard.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,17 @@ def test_qwen3_moe_ep4_fp16():
enable_expert_parallel=True
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

def test_qwen3_moe_tp2_w8a8():
    """Run greedy generation on the W8A8 Qwen3 MoE checkpoint with tensor parallel size 2.

    The test makes no assertion on the generated text; it passes as long as
    building the runner and generating ``max_tokens`` tokens does not raise.
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "vllm-ascend/Qwen3-30B-A3B-W8A8",
        tensor_parallel_size=2,
        enforce_eager=True,
        dtype="float16",
        quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
42 changes: 42 additions & 0 deletions tests/ut/_310p/fused_moe/test_experts_selector_310.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import torch

from vllm_ascend._310p.fused_moe.experts_selector import select_experts


class TestExpertsSelector310:
    """Output-shape checks for the 310P ``select_experts`` function."""

    @pytest.mark.parametrize("global_num_experts", [256, 128])
    def test_select_experts(self, global_num_experts):
        """Softmax routing without grouped top-k yields (num_tokens, top_k) outputs."""
        num_tokens, top_k = 8, 2

        # Both inputs are random (8, 2) tensors; only the output shapes are checked.
        hidden = torch.randn(num_tokens, 2)
        logits = torch.randn(num_tokens, 2)

        routing_kwargs = dict(
            hidden_states=hidden,
            router_logits=logits,
            top_k=top_k,
            use_grouped_topk=False,
            renormalize=True,
            topk_group=None,
            num_expert_group=None,
            custom_routing_function=None,
            scoring_func="softmax",
            e_score_correction_bias=None,
            global_num_experts=global_num_experts,
        )
        topk_weights, topk_ids = select_experts(**routing_kwargs)

        expected_shape = (num_tokens, top_k)
        assert topk_weights.shape == expected_shape
        assert topk_ids.shape == expected_shape
132 changes: 132 additions & 0 deletions tests/ut/_310p/fused_moe/test_moe_mlp_310.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import call, patch

import torch

from tests.ut.base import TestBase
from vllm_ascend._310p.fused_moe.moe_mlp import unified_apply_mlp


class TestUnifiedApplyMLP310(TestBase):
    """Checks which ``torch_npu`` kernels ``unified_apply_mlp`` calls, using mocks only."""

    @patch("torch_npu.npu_grouped_matmul")
    @patch("torch_npu.npu_swiglu")
    def test_unified_apply_mlp_without_quantization_310(self, mock_npu_swiglu, mock_npu_grouped_matmul):
        """Unquantized path: two grouped matmuls, with swiglu fed the first one's output."""
        first_gmm_out = torch.randn(10, 40, dtype=torch.float16)
        second_gmm_out = torch.randn(10, 20, dtype=torch.float16)
        # Each mocked grouped-matmul call returns a single-element list.
        mock_npu_grouped_matmul.side_effect = [[first_gmm_out], [second_gmm_out]]

        swiglu_out = torch.randn(10, 40, dtype=torch.float16)
        mock_npu_swiglu.return_value = swiglu_out

        hidden_states = torch.randn(10, 20, dtype=torch.float16)
        w1 = torch.randn(5, 20, 40, dtype=torch.float16)
        w2 = torch.randn(5, 40, 20, dtype=torch.float16)
        group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64)

        result = unified_apply_mlp(
            hidden_states=hidden_states,
            w1=w1,
            w1_scale=None,
            w2=w2,
            w2_scale=None,
            group_list=group_list,
            group_list_type=1,
            with_quant=False,
        )

        # Keyword arguments shared by both expected grouped-matmul calls.
        shared_kwargs = dict(split_item=2, group_list_type=1, group_type=0, group_list=group_list)
        expected_calls = [
            call(x=[hidden_states], weight=[w1], **shared_kwargs),
            call(x=[swiglu_out], weight=[w2], **shared_kwargs),
        ]
        self.assertEqual(mock_npu_grouped_matmul.call_count, 2)
        mock_npu_grouped_matmul.assert_has_calls(expected_calls, any_order=True)
        mock_npu_swiglu.assert_called_once_with(first_gmm_out)

        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)

    @patch("torch.cumsum")
    @patch("torch_npu.npu_quant_grouped_matmul_dequant")
    @patch("torch_npu.npu_swiglu")
    def test_unified_apply_mlp_with_quantization_310(
        self, mock_npu_swiglu, mock_npu_quant_grouped_matmul_dequant, mock_cumsum
    ):
        """Quantized path: two quant-matmul-dequant calls that use the cumsum result as group list."""
        cumsum_out = torch.arange(0, 10, dtype=torch.int64)
        mock_cumsum.return_value = cumsum_out
        first_gmm_out = torch.randn(10, 40, dtype=torch.float16)
        second_gmm_out = torch.randn(10, 20, dtype=torch.float16)
        # Unlike the unquantized kernel mock, these return values are bare tensors.
        mock_npu_quant_grouped_matmul_dequant.side_effect = [first_gmm_out, second_gmm_out]

        swiglu_out = torch.randn(10, 40, dtype=torch.float16)
        mock_npu_swiglu.return_value = swiglu_out

        hidden_states = torch.randn(10, 20, dtype=torch.float16)
        w1 = torch.randn(5, 20, 40, dtype=torch.float16)
        w1_scale = torch.rand(5, 40, dtype=torch.float32)
        w2 = torch.randn(5, 40, 20, dtype=torch.float16)
        w2_scale = torch.rand(5, 40, dtype=torch.float32)
        group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64)

        result = unified_apply_mlp(
            hidden_states=hidden_states,
            w1=w1,
            w1_scale=w1_scale,
            w2=w2,
            w2_scale=w2_scale,
            group_list=group_list,
            group_list_type=1,
            with_quant=True,
        )

        mock_cumsum.assert_called_once()

        # Keyword arguments shared by both expected quantized matmul calls.
        shared_kwargs = dict(group_list=cumsum_out, quant_mode="pertoken")
        expected_calls = [
            call(x=hidden_states, quantized_weight=w1, weight_scale=w1_scale, **shared_kwargs),
            call(x=swiglu_out, quantized_weight=w2, weight_scale=w2_scale, **shared_kwargs),
        ]
        self.assertEqual(mock_npu_quant_grouped_matmul_dequant.call_count, 2)
        mock_npu_quant_grouped_matmul_dequant.assert_has_calls(expected_calls, any_order=True)
        mock_npu_swiglu.assert_called_once_with(first_gmm_out)

        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)
47 changes: 42 additions & 5 deletions tests/ut/_310p/quantization/test_modelslim_config_310.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,26 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import MagicMock, patch

from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoEParallelConfig
from vllm.model_executor.layers.linear import LinearBase

from tests.ut.base import TestBase
from vllm_ascend._310p.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod310
from vllm_ascend._310p.quantization.modelslim_config import AscendModelSlimConfig310
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod

Expand All @@ -31,7 +47,7 @@ def test_get_quant_method_for_linear_310(self):
# Test skipped layer
with (
patch("vllm_ascend._310p.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config),
patch.object(self.ascend_config, "is_layer_skipped_ascend", return_value=True)
patch.object(self.ascend_config, "is_layer_skipped_ascend", return_value=True),
):
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
self.assertIsInstance(method, AscendUnquantizedLinearMethod)
Expand All @@ -54,14 +70,35 @@ def test_get_quant_method_for_fused_moe_310(self):
fused_moe_layer = MagicMock(spec=FusedMoE)
fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
fused_moe_layer.moe_config.moe_parallel_config = MagicMock(spec=FusedMoEParallelConfig)
fused_moe_layer.moe_config.moe_parallel_config.use_ep = True
fused_moe_layer.moe_config.moe_parallel_config.dp_size = 1
mock_config = MagicMock()
mock_config.model_config.hf_config.model_type = None
mock_config.compilation_config.custom_ops = ["all"]
mock_scheme = MagicMock()
# Test skipped layer
with (
patch("vllm.config.vllm.get_current_vllm_config", return_value=mock_config),
patch("vllm_ascend._310p.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config),
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config),
patch.object(self.ascend_config, "is_layer_skipped_ascend", return_value=True),
):
method = self.ascend_config.get_quant_method(fused_moe_layer, ".moe")
self.assertIsInstance(method, AscendUnquantizedFusedMoEMethod310)

# Test quantized layer
mock_scheme = MagicMock()
with (
patch.object(self.ascend_config, "is_layer_skipped_ascend", return_value=False),
patch("vllm.config.vllm.get_current_vllm_config", return_value=mock_config),
patch("vllm_ascend._310p.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config),
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config),
patch("vllm_ascend._310p.quantization.modelslim_config.create_scheme_for_layer", return_value=mock_scheme),
patch("vllm_ascend._310p.quantization.modelslim_config.AscendLinearMethod", return_value=MagicMock()),
self.assertRaises(NotImplementedError),
patch(
"vllm_ascend._310p.quantization.modelslim_config.AscendFusedMoEMethod", return_value=MagicMock()
) as fused_moe_method,
):
self.ascend_config.get_quant_method(fused_moe_layer, "moe_layer")
method = self.ascend_config.get_quant_method(fused_moe_layer, ".moe")
self.assertIs(method, fused_moe_method.return_value)
fused_moe_method.assert_called_once_with(mock_scheme, fused_moe_layer.moe_config)
66 changes: 66 additions & 0 deletions tests/ut/_310p/quantization/test_w8a8_dynamic_310.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import Mock, patch

import torch

from tests.ut.base import TestBase
from vllm_ascend._310p.quantization.methods.w8a8_dynamic import AscendW8A8DynamicFusedMoEMethod310


class TestAscendW8A8FusedMoEMethod310(TestBase):
    """Parameter dtype/shape checks for ``AscendW8A8DynamicFusedMoEMethod310``."""

    # Dimensions shared by every test in this class.
    num_experts = 8
    hidden_size = 128
    intermediate_size = 128

    @patch("vllm_ascend._310p.quantization.methods.w8a8_dynamic.get_ep_group")
    def setUp(self, mock_get_ep_group):
        """Build the quant method with the vLLM config and EP group lookups mocked."""
        with patch(
            "vllm_ascend._310p.quantization.methods.w8a8_dynamic.get_current_vllm_config"
        ) as mock_get_current_vllm_config:
            mock_vllm_config = Mock()
            mock_vllm_config.quant_config = Mock(quant_description={"group_size": 0})
            mock_vllm_config.scheduler_config = Mock(
                max_num_batched_tokens=2048, max_model_len=2048, enable_chunked_prefill=False
            )
            mock_get_current_vllm_config.return_value = mock_vllm_config
            mock_get_ep_group.return_value = Mock()

            # Construct while the patches are active so any config / EP-group
            # lookup made by the constructor gets the mocks above.
            self.quant_method = AscendW8A8DynamicFusedMoEMethod310()

    def test_get_weight_310(self):
        """``get_weight`` returns int8 w13/w2 weights with the expected expert-major shapes."""
        param_dict = self.quant_method.get_weight(
            self.num_experts, self.intermediate_size, self.hidden_size, torch.float16
        )
        self.assertEqual(param_dict["w13_weight"].dtype, torch.int8)
        self.assertEqual(
            param_dict["w13_weight"].shape, (self.num_experts, 2 * self.intermediate_size, self.hidden_size)
        )
        self.assertEqual(param_dict["w2_weight"].dtype, torch.int8)
        self.assertEqual(param_dict["w2_weight"].shape, (self.num_experts, self.hidden_size, self.intermediate_size))

    def test_get_dynamic_quant_param_310(self):
        """``get_dynamic_quant_param`` returns float32 scales with a trailing dimension of 1."""
        param_dict = self.quant_method.get_dynamic_quant_param(
            self.num_experts, self.intermediate_size, self.hidden_size, torch.float16
        )
        self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.float32)
        self.assertEqual(param_dict["w13_weight_scale"].shape, (self.num_experts, 2 * self.intermediate_size, 1))
        self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.float32)
        self.assertEqual(param_dict["w2_weight_scale"].shape, (self.num_experts, self.hidden_size, 1))
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import MagicMock, patch

import torch
Expand All @@ -16,19 +31,19 @@ def test_get_weight_310(self):
self.assertEqual(weight["weight"].shape, (20, 10))

def test_get_pertensor_param_310(self):
params = self.method.get_pertensor_param(torch.bfloat16)
self.assertEqual(params["input_scale"].dtype, torch.bfloat16)
params = self.method.get_pertensor_param(torch.float16)
self.assertEqual(params["input_scale"].dtype, torch.float16)
self.assertEqual(params["input_offset"].dtype, torch.int8)
self.assertEqual(params["input_scale"].shape, (1,))
self.assertEqual(params["input_offset"].shape, (1,))

def test_get_perchannel_param_310(self):
params = self.method.get_perchannel_param(10, torch.bfloat16)
params = self.method.get_perchannel_param(10, torch.float16)

self.assertEqual(params["quant_bias"].dtype, torch.int32)
self.assertEqual(params["deq_scale"].dtype, torch.float32)
self.assertEqual(params["weight_scale"].dtype, torch.bfloat16)
self.assertEqual(params["weight_offset"].dtype, torch.bfloat16)
self.assertEqual(params["deq_scale"].dtype, torch.int64)
self.assertEqual(params["weight_scale"].dtype, torch.float16)
self.assertEqual(params["weight_offset"].dtype, torch.float16)
self.assertEqual(params["quant_bias"].shape, (10,))
self.assertEqual(params["deq_scale"].shape, (10,))
self.assertEqual(params["weight_scale"].shape, (10, 1))
Expand Down
4 changes: 0 additions & 4 deletions vllm_ascend/_310p/fused_moe/experts_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import torch

from vllm_ascend.ops.fused_moe.experts_selector import _native_select_experts
from vllm_ascend.utils import get_weight_prefetch_method


def select_experts(
Expand Down Expand Up @@ -55,9 +54,6 @@ def select_experts(
topk_weights: router weights of shape (num_tokens, top_k).
topk_ids: selected expert IDs of shape (num_tokens, top_k).
"""
# prefetch w1_w3_proj.weight preprocess
weight_prefetch_method = get_weight_prefetch_method()
weight_prefetch_method.maybe_prefetch_moe_weight_preprocess(hidden_states, "gate_up")
topk_weights, topk_ids = _native_select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
Expand Down
Loading