vllm-project · menogrey · Feb 9, 2026 · Feb 26, 2026 · Feb 27, 2026 · Feb 27, 2026
@@ -12,6 +12,7 @@
       "BAAI/bge-small-en-v1.5",
       "BAAI/kernel_meta",
       "ByteDance-Seed/BAGEL-7B-MoT",
+      "cpatonn-mirror/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
       "DeepSeek-ai/DeepSeek-OCR",
       "DevQuasar/deepseek-ai.DeepSeek-V3.2-BF16",
       "Eco-Tech/DeepSeek-V3.1-w8a8-mtp-QuaRot",

@@ -21,67 +21,53 @@
 from tests.e2e.conftest import VllmRunner
 
 
-def test_qwen2_5_w8a8_external_quantized_tp2():
-    example_prompts = [
-        "The president of the United States is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            "neuralmagic/Qwen2.5-3B-quantized.w8a8",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            max_model_len=4096,
-            gpu_memory_utilization=0.8,
-    ) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+TEST_CASES = [  # each entry: model id + expected greedy output(s), consumed via parametrize
+    pytest.param(  # NOTE(review): needs `pytest` in scope; its import is not shown in this hunk - confirm
+        "neuralmagic/Qwen2.5-3B-quantized.w8a8",
+        [
+            "The president of the United States is the head of state and",
+        ],
+        id="dense-w8a8",
+    ),
+    pytest.param(
+        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
+        [
+            "The president of the United States is the head of state and",
+        ],
+        id="moe-w8a8-dynamic",
+    ),
+    pytest.param(
+        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
+        [
+            "The president of the United States is the head of state and",
+        ],
+        id="moe-w4a8-dynamic",
+    ),
+    pytest.param(
+        "cpatonn-mirror/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
+        [
+            "The president of the United States is the head of state and",  # NOTE(review): same golden text as the Instruct checkpoints although this is a "Thinking" model - confirm
+        ],
+        id="moe-w4a16-dynamic",
+    ),
+]
 
-    golden_results = [
-        'The president of the United States is the head of state and',
-    ]
 
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
-
-
-def test_qwen3_moe_w8a8_dynamic_llm_compressor():
+@pytest.mark.parametrize("model_id, golden_results", TEST_CASES)  # one test instance per TEST_CASES entry
+def test_compressed_tensors_tp2(model_id, golden_results):  # greedy-generates 5 tokens for one prompt and requires exact equality with golden_results
     example_prompts = [
         "The president of the United States is",
     ]
     max_tokens = 5
     with VllmRunner(
-            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
-            tensor_parallel_size=2,
-            max_model_len=4096,
-            gpu_memory_utilization=0.8,
+        model_id,  # model under test, supplied by the parametrized case
+        max_model_len=4096,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],  # now passed for every case; the removed MoE tests did not set it
+        gpu_memory_utilization=0.8,
     ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-    golden_results = [
-        'The president of the United States is the head of state and',
-    ]
-
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
-
-def test_qwen3_moe_w4a8_dynamic_llm_compressor():
-    example_prompts = [
-        "The president of the United States is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
-            tensor_parallel_size=2,
-            max_model_len=4096,
-            gpu_memory_utilization=0.8,
-    ) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    golden_results = [
-        'The president of the United States is the head of state and',
-    ]
-
     for i in range(len(vllm_output)):
         assert golden_results[i] == vllm_output[i][1]
         print(f"Generated text: {vllm_output[i][1]!r}")
@@ -3,12 +3,15 @@
 import os
 import tempfile
 from unittest.mock import MagicMock, patch
+import torch
 
 from tests.ut.base import TestBase
 from vllm_ascend.quantization.modelslim_config import MODELSLIM_CONFIG_FILENAME
 from vllm_ascend.quantization.utils import (
     detect_quantization_method,
     maybe_auto_detect_quantization,
+    pack_to_int32,
+    unpack_from_int32,
 )
 from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD
 
@@ -180,3 +183,87 @@ def test_no_detection_emits_no_log(self, mock_detect):
                 maybe_auto_detect_quantization(vllm_config)
 
         self.assertIsNone(vllm_config.model_config.quantization)
+
+
+class TestUnpackFromInt32(TestBase):  # checks output dtype/shape and input assertions of unpack_from_int32; unpacked values are not compared
+
+    def test_unpack_from_int32_packed_dim_1(self):  # packed_dim=1: (1, 2) int32 input, requested shape (1, 8)
+        weight = torch.tensor([[305419896, -1420531520]], dtype=torch.int32)  # 305419896 == 0x12345678
+        shape = torch.Size([1, 8])
+        num_bits = 4
+
+        result = unpack_from_int32(weight, shape, num_bits, packed_dim=1)
+
+        self.assertEqual(result.dtype, torch.int8)  # result is expected to come back as int8
+        self.assertEqual(result.shape, shape)  # result must match the requested shape exactly
+
+    def test_unpack_from_int32_packed_dim_0(self):  # packed_dim=0: (2, 1) int32 input, requested shape (8, 1)
+        weight = torch.tensor([[305419896], [-1420531520]], dtype=torch.int32)  # same two words as the dim-1 test, stacked as rows
+        shape = torch.Size([8, 1])
+        num_bits = 4
+
+        result = unpack_from_int32(weight, shape, num_bits, packed_dim=0)
+
+        self.assertEqual(result.dtype, torch.int8)
+        self.assertEqual(result.shape, shape)
+
+    def test_unpack_from_int32_assertions(self):  # both calls below are expected to raise AssertionError
+        with self.assertRaises(AssertionError):
+            weight = torch.tensor([[1, 2]], dtype=torch.int64)  # int64 instead of int32 input
+            unpack_from_int32(weight, torch.Size([8, 1]), 4)
+
+        with self.assertRaises(AssertionError):
+            weight = torch.tensor([[1, 2]], dtype=torch.int32)
+            unpack_from_int32(weight, torch.Size([8, 1]), 16)  # num_bits=16 is expected to be rejected
+
+class TestPackToInt32(TestBase):  # covers pack_to_int32 for int8 and int32 inputs plus its input assertions
+
+    @patch(  # replaces the torch_npu op referenced from vllm_ascend.quantization.utils with a mock
+        "vllm_ascend.quantization.utils.torch_npu.npu_convert_weight_to_int4pack"
+    )
+    def test_pack_to_int32_int8(self, mock_npu_convert_weight_to_int4pack):  # int8 input: the mocked op must not be called; last dim goes 16 -> 4
+        mock_npu_convert_weight_to_int4pack.return_value = torch.zeros(  # NOTE(review): never consumed, since the mock is asserted not called below
+            (2, 4), dtype=torch.int32)
+
+        weight = torch.zeros((2, 8, 16), dtype=torch.int8)
+        result = pack_to_int32(weight)
+
+        self.assertEqual(result.dtype, torch.int32)
+        mock_npu_convert_weight_to_int4pack.assert_not_called()
+
+        self.assertEqual(result.shape, torch.Size([2, 8, 4]))
+
+    @patch(
+        "vllm_ascend.quantization.utils.torch_npu.npu_convert_weight_to_int4pack"
+    )
+    def test_pack_to_int32_int32(self, mock_npu_convert_weight_to_int4pack):  # int32 input with the op mocked as identity: dtype and shape stay unchanged
+
+        def mock_convert_weight(weight):  # identity stand-in for the mocked op
+            return weight
+
+        mock_npu_convert_weight_to_int4pack.side_effect = mock_convert_weight
+        weight = torch.zeros((2, 8, 8), dtype=torch.int32)
+        result = pack_to_int32(weight)
+
+        self.assertEqual(result.dtype, torch.int32)
+        self.assertEqual(result.shape, weight.shape)  # NOTE(review): the test does not assert that the mocked op was actually called
+
+    def test_pack_to_int32_assertion_dim(self):  # 2-D input is expected to raise AssertionError
+        with self.assertRaises(AssertionError):
+            weight = torch.zeros((8, 8), dtype=torch.int8)
+            pack_to_int32(weight)
+
+    def test_pack_to_int32_assertion_dtype(self):  # float32 input is expected to raise AssertionError
+        with self.assertRaises(AssertionError):
+            weight = torch.zeros((2, 8, 8), dtype=torch.float32)
+            pack_to_int32(weight)
+
+    def test_pack_to_int32_assertion_divisible(self):  # a last dim of 7 is expected to raise AssertionError for both int32 and int8
+        with self.assertRaises(AssertionError):
+            weight = torch.zeros((2, 8, 7), dtype=torch.int32)
+            pack_to_int32(weight)
+
+        with self.assertRaises(AssertionError):
+            weight = torch.zeros((2, 8, 7), dtype=torch.int8)
+            pack_to_int32(weight)
@@ -3,93 +3,7 @@
 import torch
 
 from tests.ut.base import TestBase
-from vllm_ascend.quantization.methods.w4a16 import (AscendW4A16FusedMoEMethod,
-                                                    pack_to_int32,
-                                                    unpack_from_int32)
-
-
-class TestUnpackFromInt32(TestBase):
-
-    def test_unpack_from_int32_packed_dim_1(self):
-        weight = torch.tensor([[305419896, -1420531520]], dtype=torch.int32)
-        shape = torch.Size([1, 8])
-        num_bits = 4
-
-        result = unpack_from_int32(weight, shape, num_bits, packed_dim=1)
-
-        self.assertEqual(result.dtype, torch.int8)
-        self.assertEqual(result.shape, shape)
-
-    def test_unpack_from_int32_packed_dim_0(self):
-        weight = torch.tensor([[305419896], [-1420531520]], dtype=torch.int32)
-        shape = torch.Size([8, 1])
-        num_bits = 4
-
-        result = unpack_from_int32(weight, shape, num_bits, packed_dim=0)
-
-        self.assertEqual(result.dtype, torch.int8)
-        self.assertEqual(result.shape, shape)
-
-    def test_unpack_from_int32_assertions(self):
-        with self.assertRaises(AssertionError):
-            weight = torch.tensor([[1, 2]], dtype=torch.int64)
-            unpack_from_int32(weight, torch.Size([8, 1]), 4)
-
-        with self.assertRaises(AssertionError):
-            weight = torch.tensor([[1, 2]], dtype=torch.int32)
-            unpack_from_int32(weight, torch.Size([8, 1]), 16)
-
-
-class TestPackToInt32(TestBase):
-
-    @patch(
-        "vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
-    )
-    def test_pack_to_int32_int8(self, mock_npu_convert_weight_to_int4pack):
-        mock_npu_convert_weight_to_int4pack.return_value = torch.zeros(
-            (2, 4), dtype=torch.int32)
-
-        weight = torch.zeros((2, 8, 16), dtype=torch.int8)
-        result = pack_to_int32(weight)
-
-        self.assertEqual(result.dtype, torch.int32)
-        mock_npu_convert_weight_to_int4pack.assert_not_called()
-
-        self.assertEqual(result.shape, torch.Size([2, 8, 4]))
-
-    @patch(
-        "vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
-    )
-    def test_pack_to_int32_int32(self, mock_npu_convert_weight_to_int4pack):
-
-        def mock_convert_weight(weight):
-            return weight
-
-        mock_npu_convert_weight_to_int4pack.side_effect = mock_convert_weight
-        weight = torch.zeros((2, 8, 8), dtype=torch.int32)
-        result = pack_to_int32(weight)
-
-        self.assertEqual(result.dtype, torch.int32)
-        self.assertEqual(result.shape, weight.shape)
-
-    def test_pack_to_int32_assertion_dim(self):
-        with self.assertRaises(AssertionError):
-            weight = torch.zeros((8, 8), dtype=torch.int8)
-            pack_to_int32(weight)
-
-    def test_pack_to_int32_assertion_dtype(self):
-        with self.assertRaises(AssertionError):
-            weight = torch.zeros((2, 8, 8), dtype=torch.float32)
-            pack_to_int32(weight)
-
-    def test_pack_to_int32_assertion_divisible(self):
-        with self.assertRaises(AssertionError):
-            weight = torch.zeros((2, 8, 7), dtype=torch.int32)
-            pack_to_int32(weight)
-
-        with self.assertRaises(AssertionError):
-            weight = torch.zeros((2, 8, 7), dtype=torch.int8)
-            pack_to_int32(weight)
+from vllm_ascend.quantization.methods.w4a16 import AscendW4A16FusedMoEMethod
 
 
 class TestAscendW4A16FusedMoEMethod(TestBase):
@@ -219,7 +133,7 @@ def build_layer(self):
         return layer
 
     @patch(
-        "vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
+        "vllm_ascend.quantization.utils.torch_npu.npu_convert_weight_to_int4pack"
     )
     def test_process_weights_after_loading_with_transpose(
             self, mock_npu_convert_weight_to_int4pack):

@@ -315,7 +315,11 @@ def __init__(self, *args, **kwargs):
             "weight_loader": self.weight_loader,
         }
         # need full intermediate size pre-sharding for WNA16 act order
-        if self.quant_method.__class__.__name__ in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod"):
+        if self.quant_method.__class__.__name__ in (
+            "GPTQMarlinMoEMethod",
+            "CompressedTensorsWNA16MarlinMoEMethod",
+            "CompressedTensorsWNA16MoEMethod",
+        ):
             moe_quant_params["intermediate_size_full"] = intermediate_size
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 

@@ -38,8 +38,10 @@ def __init__(
         vllm_config = get_current_vllm_config()
         self.bias = None
         # quantization with anti_method m4 will generate none-zero norm bias
-        if vllm_config.quant_config is not None and any(
-            "norm.bias" in name for name in vllm_config.quant_config.quant_description
+        if (
+            vllm_config.quant_config is not None
+            and hasattr(vllm_config.quant_config, "quant_description")
+            and any("norm.bias" in name for name in vllm_config.quant_config.quant_description)
         ):
             self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False)
 

@@ -35,3 +35,4 @@
 import vllm_ascend.patch.worker.patch_routed_experts_capturer  # noqa
 import vllm_ascend.patch.worker.patch_npugraph_ex_triton  # noqa
 import vllm_ascend.patch.worker.patch_kimi_k25  # noqa
+import vllm_ascend.patch.worker.patch_quantization  # noqa