Merged
32 commits
- `5ed63ef` refactor quantization (SlightwindSec, Jan 8, 2026)
- `d2c5dd1` fix lint (SlightwindSec, Jan 9, 2026)
- `58f5902` fix lint (SlightwindSec, Jan 9, 2026)
- `b8cb56a` fix circular import (SlightwindSec, Jan 9, 2026)
- `e91c7ad` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 9, 2026)
- `96a7dc6` add mxfp8 refactor (SlightwindSec, Jan 9, 2026)
- `dc1cc7d` remove mxfp8 get_perchannel_param (SlightwindSec, Jan 9, 2026)
- `e78ba76` fix circular import (SlightwindSec, Jan 10, 2026)
- `0e60717` fix lit (SlightwindSec, Jan 10, 2026)
- `8f6d822` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 12, 2026)
- `03a1af5` add minmax_m2 (SlightwindSec, Jan 12, 2026)
- `134a517` remove get_quant_method (SlightwindSec, Jan 13, 2026)
- `a8fc22e` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 13, 2026)
- `95c23a5` fix lint (SlightwindSec, Jan 13, 2026)
- `3f615a3` fix mypy type hint (SlightwindSec, Jan 13, 2026)
- `aadcfdf` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 14, 2026)
- `95f4271` decouple linear and moe in compressed_tensors_config (SlightwindSec, Jan 14, 2026)
- `e9f8c39` rename wrappers.py -> method_adapters.py (SlightwindSec, Jan 14, 2026)
- `29b8194` remove wrappers.py (SlightwindSec, Jan 14, 2026)
- `fb73714` fix type checking (SlightwindSec, Jan 15, 2026)
- `14ca680` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 15, 2026)
- `390afac` rename eplb config (SlightwindSec, Jan 15, 2026)
- `d5fc453` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 15, 2026)
- `492867c` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 20, 2026)
- `b1d0418` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 20, 2026)
- `814992e` run ci (SlightwindSec, Jan 20, 2026)
- `a8e6713` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 20, 2026)
- `f865ef9` Merge remote-tracking branch 'upstream/main' into quantization_refactor (SlightwindSec, Jan 21, 2026)
- `7cb5745` add w4a4_laos (SlightwindSec, Jan 22, 2026)
- `c2fb6cf` rm utils.py (SlightwindSec, Jan 22, 2026)
- `8047049` rebase (SlightwindSec, Jan 22, 2026)
- `2fa1d6a` rename W4A4_DYNAMIC (SlightwindSec, Jan 23, 2026)
23 changes: 13 additions & 10 deletions docs/source/developer_guide/feature_guide/quantization.md
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@ The current process for registering and obtaining quantization methods in vLLM A

![get_quant_method](../../assets/quantization/get_quant_method.png)

vLLM Ascend registers a custom ascend quantization method. By configuring the `--quantization ascend` parameter (or `quantization="ascend"` for offline), the quantization feature is enabled. When constructing the `quant_config`, the registered `AscendQuantConfig` is initialized and `get_quant_method` is called to obtain the quantization method corresponding to each weight part, stored in the `quant_method` attribute.
vLLM Ascend registers a custom ascend quantization method. By configuring the `--quantization ascend` parameter (or `quantization="ascend"` for offline), the quantization feature is enabled. When constructing the `quant_config`, the registered `AscendModelSlimConfig` is initialized and `get_quant_method` is called to obtain the quantization method corresponding to each weight part, stored in the `quant_method` attribute.

Currently supported quantization methods include `AscendLinearMethod`, `AscendFusedMoEMethod`, `AscendEmbeddingMethod`, and their corresponding non-quantized methods:

@@ -51,26 +51,29 @@ Based on the above content, we present a brief description of the adaptation pro
### Quantization Algorithm Adaptation

- **Step 1: Algorithm Design**. Define the algorithm ID (e.g., `W4A8_DYNAMIC`), determine supported layers (linear, moe, attention), and design the quantization scheme (static/dynamic, pertensor/perchannel/pergroup).
- **Step 2: Registration**. Add the algorithm ID to `ASCEND_QUANTIZATION_METHOD_MAP` in `vllm_ascend/quantization/utils.py` and associate it with the corresponding method class.
- **Step 2: Registration**. Use the `@register_scheme` decorator in `vllm_ascend/quantization/methods/registry.py` to register your quantization scheme class.

```python
ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
    "W4A8_DYNAMIC": {
        "linear": AscendW4A8DynamicLinearMethod,
        "moe": AscendW4A8DynamicFusedMoEMethod,
    },
}
from vllm_ascend.quantization.methods import register_scheme, AscendLinearScheme

@register_scheme("W4A8_DYNAMIC", "linear")
class AscendW4A8DynamicLinearMethod(AscendLinearScheme):
    ...

@register_scheme("W4A8_DYNAMIC", "moe")
class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
    ...
```

- **Step 3: Implementation**. Create an algorithm implementation file, such as `vllm_ascend/quantization/w4a8_dynamic.py`, and implement the method class and logic.
- **Step 3: Implementation**. Create an algorithm implementation file, such as `vllm_ascend/quantization/methods/w4a8.py`, and implement the method class and logic.
- **Step 4: Testing**. Use your algorithm to generate quantization configurations and verify correctness and performance on target models and hardware.

### Quantized Model Adaptation

Adapting a new quantized model requires ensuring the following three points:

- The original model has been successfully adapted in `vLLM Ascend`.
- **Fused Module Mapping**: Add the model's `model_type` to `packed_modules_model_mapping` in `vllm_ascend/quantization/quant_config.py` (e.g., `qkv_proj`, `gate_up_proj`, `experts`) to ensure sharding consistency and correct loading.
- **Fused Module Mapping**: Add the model's `model_type` to `packed_modules_model_mapping` in `vllm_ascend/quantization/modelslim_config.py` (e.g., `qkv_proj`, `gate_up_proj`, `experts`) to ensure sharding consistency and correct loading.

```python
packed_modules_model_mapping = {
21 changes: 0 additions & 21 deletions tests/ut/ops/test_fused_moe.py
@@ -20,8 +20,6 @@
import torch.nn as nn
import torch_npu
from pytest_mock import MockerFixture
from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase

from tests.ut.base import TestBase
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.ops.fused_moe.experts_selector import select_experts
@@ -233,25 +231,6 @@ def __init__(self, shared_experts, num_tokens):
self.apply = MagicMock(return_value=(torch.randn(num_tokens, 32)))


class MockFusedMoEMethod(FusedMoEMethodBase):
    moe = MagicMock()

    def __init__(self):
        super().__init__(self.moe)

    def create_weights(self, layer: torch.nn.Module, num_experts: int,
                       hidden_size: int, intermediate_size_per_partition: int,
                       params_dtype: torch.dtype, **extra_weight_attrs):
        pass

    def apply(self, hidden_states: torch.Tensor,
              expert_weights: torch.Tensor) -> torch.Tensor:
        pass

    def get_fused_moe_quant_config(self, layer: torch.nn.Module):
        pass


class TestExpertsSelector:

@pytest.mark.parametrize("global_num_experts", [256, 128])
Original file line number Diff line number Diff line change
@@ -7,11 +7,11 @@

from tests.ut.base import TestBase
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend.quantization.quant_config import AscendQuantConfig
from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD


class TestAscendQuantConfig(TestBase):
class TestAscendModelSlimConfig(TestBase):

def setUp(self):
self.sample_config = {
@@ -25,7 +25,7 @@ def setUp(self):
"shard1.weight": "FLOAT",
"shard2.weight": "FLOAT",
}
self.ascend_config = AscendQuantConfig(self.sample_config)
self.ascend_config = AscendModelSlimConfig(self.sample_config)
self.ascend_config.packed_modules_mapping = None

def test_init(self):
@@ -34,78 +34,80 @@

def test_repr(self):
repr_str = repr(self.ascend_config)
self.assertTrue(repr_str.startswith("AscendQuantConfig:\n"))
self.assertTrue(repr_str.startswith("AscendModelSlimConfig:\n"))

def test_get_name(self):
self.assertEqual(AscendQuantConfig.get_name(),
self.assertEqual(AscendModelSlimConfig.get_name(),
ASCEND_QUANTIZATION_METHOD)

def test_get_supported_act_dtypes(self):
supported_dtypes = AscendQuantConfig.get_supported_act_dtypes()
supported_dtypes = AscendModelSlimConfig.get_supported_act_dtypes()
self.assertEqual(len(supported_dtypes), 3)

def test_get_min_capability(self):
with self.assertRaises(NotImplementedError):
AscendQuantConfig.get_min_capability()
AscendModelSlimConfig.get_min_capability()

def test_get_config_filenames(self):
filenames = AscendQuantConfig.get_config_filenames()
filenames = AscendModelSlimConfig.get_config_filenames()
self.assertEqual(filenames, ["quant_model_description.json"])

def test_from_config(self):
config = AscendQuantConfig.from_config(self.sample_config)
self.assertIsInstance(config, AscendQuantConfig)
config = AscendModelSlimConfig.from_config(self.sample_config)
self.assertIsInstance(config, AscendModelSlimConfig)
self.assertEqual(config.quant_description, self.sample_config)

@patch('torch.npu.is_available')
def test_override_quantization_method(self, mock_is_available):
# Test when NPU is available
mock_is_available.return_value = True
result = AscendQuantConfig.override_quantization_method(None, None)
result = AscendModelSlimConfig.override_quantization_method(None, None)
self.assertIsNone(result)
hf_quant_cfg = {"quant_method": ""}
result = AscendQuantConfig.override_quantization_method(
result = AscendModelSlimConfig.override_quantization_method(
hf_quant_cfg, None)
self.assertEqual(result, "ascend")

# Test when NPU is not available
mock_is_available.return_value = False
result = AscendQuantConfig.override_quantization_method(None, None)
result = AscendModelSlimConfig.override_quantization_method(None, None)
self.assertIsNone(result)
hf_quant_cfg = {"quant_method": ""}
result = AscendQuantConfig.override_quantization_method(
result = AscendModelSlimConfig.override_quantization_method(
hf_quant_cfg, None)
self.assertIsNone(result)

def test_get_quant_method_for_linear(self):
mock_config = MagicMock()
mock_config.model_config.hf_text_config.model_type = None
mock_config.model_config.hf_config.model_type = None
linear_layer = MagicMock(spec=LinearBase)
# Test skipped layer
with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
with patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch.object(self.ascend_config, \
'is_layer_skipped_ascend',
return_value=True):
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
self.assertIsInstance(method, AscendUnquantizedLinearMethod)

# Test quantized layer
mock_scheme = MagicMock()
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendLinearMethod', return_value=MagicMock()) as mock_ascend_linear:
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch("vllm_ascend.quantization.modelslim_config.create_scheme_for_layer", return_value=mock_scheme), \
patch('vllm_ascend.quantization.method_adapters.AscendLinearMethod', return_value=MagicMock()) as mock_ascend_linear:

method = self.ascend_config.get_quant_method(linear_layer, ".attn")
self.assertIs(method, mock_ascend_linear.return_value)
mock_ascend_linear.assert_called_once_with(
self.ascend_config, ".attn",
self.ascend_config.packed_modules_mapping, linear_layer)
mock_ascend_linear.assert_called_once_with(mock_scheme)

def test_get_quant_method_for_attention(self):
attention_layer = MagicMock(spec=Attention)
mock_config = MagicMock()
mock_config.model_config.hf_text_config.model_type = None
with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
mock_config.model_config.hf_config.model_type = None
mock_scheme = MagicMock()
with patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch("vllm_ascend.quantization.modelslim_config.create_scheme_for_layer", return_value=mock_scheme), \
patch('vllm_ascend.quantization.method_adapters.AscendKVCacheMethod', \
return_value=MagicMock()) as mock_ascend_kvcache:
# Test with fa_quant_type
method = self.ascend_config.get_quant_method(
@@ -117,20 +119,22 @@ def test_get_quant_method_for_fused_moe(self):
fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
mock_config = MagicMock()
mock_config.model_config.hf_text_config.model_type = None
mock_config.model_config.hf_config.model_type = None

# Test skipped layer
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.ops.fused_moe.fused_moe.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
method = self.ascend_config.get_quant_method(
fused_moe_layer, "moe_layer")
self.assertIs(method, mock_ascend_moe.return_value)

# Test quantized layer
mock_scheme = MagicMock()
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch("vllm_ascend.quantization.modelslim_config.create_scheme_for_layer", return_value=mock_scheme), \
patch('vllm_ascend.quantization.method_adapters.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
method = self.ascend_config.get_quant_method(
fused_moe_layer, "moe_layer")
self.assertIs(method, mock_ascend_moe.return_value)
@@ -150,7 +154,7 @@ def test_is_layer_skipped_ascend(self):

# Test inconsistent fused layer shards
bad_config = {"shard1.weight": "FLOAT", "shard2.weight": "INT8"}
config = AscendQuantConfig(bad_config)
config = AscendModelSlimConfig(bad_config)
with self.assertRaises(ValueError):
config.is_layer_skipped_ascend("fused_layer", fused_mapping)

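Many of the changed lines above only move `@patch` targets from `quant_config` to `modelslim_config` or `method_adapters`, tracking where the patched names are now imported. That follows `unittest.mock`'s rule of patching a name where it is *looked up*, not where it is defined. A self-contained illustration (the module names `helper_mod` and `consumer` are invented for this sketch):

```python
import sys
import types
from unittest.mock import patch

# Build a "defining" module and a "consuming" module in memory.
helper_mod = types.ModuleType("helper_mod")
helper_mod.helper = lambda: "real"
sys.modules["helper_mod"] = helper_mod

consumer = types.ModuleType("consumer")
# `from helper_mod import helper` copies the binding into consumer's namespace.
exec("from helper_mod import helper\ndef use():\n    return helper()",
     consumer.__dict__)
sys.modules["consumer"] = consumer

# Patching the defining module does NOT affect consumer.use(),
# because consumer holds its own reference to the original function:
with patch("helper_mod.helper", return_value="fake"):
    assert consumer.use() == "real"

# Patching the name inside the consuming module does take effect:
with patch("consumer.helper", return_value="fake"):
    assert consumer.use() == "fake"
```

This is why renaming `quant_config.py` to `modelslim_config.py` forces every `@patch("vllm_ascend.quantization.quant_config...")` string in the tests to be updated in lockstep.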
50 changes: 0 additions & 50 deletions tests/ut/quantization/test_utils.py

This file was deleted.

15 changes: 8 additions & 7 deletions tests/ut/quantization/test_w4a16.py
@@ -3,8 +3,9 @@
import torch

from tests.ut.base import TestBase
from vllm_ascend.quantization.w4a16 import (AscendW4A16FusedMoEMethod,
pack_to_int32, unpack_from_int32)
from vllm_ascend.quantization.methods.w4a16 import (AscendW4A16FusedMoEMethod,
pack_to_int32,
unpack_from_int32)


class TestUnpackFromInt32(TestBase):
@@ -42,7 +43,7 @@ def test_unpack_from_int32_assertions(self):
class TestPackToInt32(TestBase):

@patch(
"vllm_ascend.quantization.w4a16.torch_npu.npu_convert_weight_to_int4pack"
"vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
)
def test_pack_to_int32_int8(self, mock_npu_convert_weight_to_int4pack):
mock_npu_convert_weight_to_int4pack.return_value = torch.zeros(
@@ -57,7 +58,7 @@ def test_pack_to_int32_int8(self, mock_npu_convert_weight_to_int4pack):
self.assertEqual(result.shape, torch.Size([2, 8, 4]))

@patch(
"vllm_ascend.quantization.w4a16.torch_npu.npu_convert_weight_to_int4pack"
"vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
)
def test_pack_to_int32_int32(self, mock_npu_convert_weight_to_int4pack):

@@ -97,8 +98,8 @@ class TestAscendW4A16FusedMoEMethod(TestBase):
output_size = 128
group_size = 32

@patch("vllm_ascend.quantization.w4a16.get_ascend_config")
@patch("vllm_ascend.quantization.w4a16.get_current_vllm_config")
@patch("vllm_ascend.quantization.methods.w4a16.get_ascend_config")
@patch("vllm_ascend.quantization.methods.w4a16.get_current_vllm_config")
def setUp(self, mock_get_current_vllm_config, mock_get_ascend_config):
mock_ascend_config = Mock()
mock_ascend_config.eplb_config.dynamic_eplb = False
@@ -218,7 +219,7 @@ def build_layer(self):
return layer

@patch(
"vllm_ascend.quantization.w4a16.torch_npu.npu_convert_weight_to_int4pack"
"vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
)
def test_process_weights_after_loading_with_transpose(
self, mock_npu_convert_weight_to_int4pack):
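The `pack_to_int32`/`unpack_from_int32` helpers exercised by these tests pack signed 4-bit weight values into 32-bit words. A pure-Python sketch of that round trip, under the assumption of little-endian nibble order; the real implementation delegates to `torch_npu.npu_convert_weight_to_int4pack` and may differ in layout:

```python
from typing import List


def pack_to_int32(nibbles: List[int]) -> List[int]:
    """Pack groups of 8 signed 4-bit values (-8..7) into one 32-bit word each."""
    assert len(nibbles) % 8 == 0
    packed = []
    for i in range(0, len(nibbles), 8):
        word = 0
        for j, v in enumerate(nibbles[i:i + 8]):
            assert -8 <= v <= 7
            word |= (v & 0xF) << (4 * j)  # two's-complement nibble, low-first
        packed.append(word)
    return packed


def unpack_from_int32(packed: List[int]) -> List[int]:
    """Inverse of pack_to_int32: recover signed 4-bit values."""
    out = []
    for word in packed:
        for j in range(8):
            v = (word >> (4 * j)) & 0xF
            out.append(v - 16 if v >= 8 else v)  # sign-extend the nibble
    return out


vals = [1, -2, 3, -4, 5, -6, 7, -8]
assert unpack_from_int32(pack_to_int32(vals)) == vals
```

The round-trip assertion mirrors what `TestUnpackFromInt32`/`TestPackToInt32` check against the NPU kernel: packing then unpacking must reproduce the original int4 values exactly.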