From b41642affaa2ba35fc7e8ef960464ddb77163f00 Mon Sep 17 00:00:00 2001
From: LHXuuu <scut_xlh@163.com>
Date: Thu, 13 Nov 2025 14:14:37 +0800
Subject: [PATCH 01/19] support compressed tensors w8a8 static and dynamic
 quantization

Signed-off-by: LHXuuu <scut_xlh@163.com>
---
 vllm_ascend/platform.py                       |   9 +-
 .../compressed_tensors/__init__.py            |   0
 .../compressed_tensors/compressed_tensors.py  | 300 ++++++++++++++++++
 .../compressed_tensors/schemes/__init__.py    |   7 +
 .../schemes/compressed_tensors_w8a8.py        | 141 ++++++++
 .../compressed_tensors_w8a8_dynamic.py        |  89 ++++++
 vllm_ascend/quantization/quant_config.py      |   3 +-
 vllm_ascend/utils.py                          |   1 +
 vllm_ascend/worker/worker_v1.py               |   2 +
 9 files changed, 549 insertions(+), 3 deletions(-)
 create mode 100644 vllm_ascend/quantization/compressed_tensors/__init__.py
 create mode 100644 vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
 create mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/__init__.py
 create mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
 create mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py

diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 5559df8c02f..8aa1d4b4c0e 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -30,7 +30,8 @@
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
+                               COMPRESSED_TENSORS_METHOD, enable_sp, is_310p,
                                prefill_context_parallel_enable,
                                update_aclgraph_sizes,
                                update_cudagraph_capture_sizes,
@@ -55,7 +56,9 @@ class NPUPlatform(Platform):
     device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
     dispatch_key: str = "PrivateUse1"
 
-    supported_quantization: list[str] = [ASCEND_QUANTIZATION_METHOD]
+    supported_quantization: list[str] = [
+        ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD
+    ]
 
     def is_sleep_mode_available(self) -> bool:
         return True
@@ -78,6 +81,8 @@ def pre_register_and_update(cls,
                 if ASCEND_QUANTIZATION_METHOD not in quant_action.choices:
                     quant_action.choices.append(ASCEND_QUANTIZATION_METHOD)
 
+        from vllm_ascend.quantization.compressed_tensors.compressed_tensors import \
+            AscendCompressedTensorsConfig  # noqa: F401
         from vllm_ascend.quantization.quant_config import \
             AscendQuantConfig  # noqa: F401
 
diff --git a/vllm_ascend/quantization/compressed_tensors/__init__.py b/vllm_ascend/quantization/compressed_tensors/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
new file mode 100644
index 00000000000..7938d910da3
--- /dev/null
+++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
@@ -0,0 +1,300 @@
+from typing import TYPE_CHECKING, Any, Optional, cast
+
+import torch
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.quantization import (
+    QUANTIZATION_METHODS, register_quantization_config)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \
+    CompressedTensorsScheme
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    find_matched_target, is_activation_quantization_format,
+    should_ignore_layer)
+
+from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
+
+from .schemes.compressed_tensors_w8a8 import CompressedTensorsW8A8
+from .schemes.compressed_tensors_w8a8_dynamic import \
+    CompressedTensorsW8A8Dynamic
+
+if TYPE_CHECKING:
+    from vllm.model_executor.models.utils import WeightsMapper
+
+logger = init_logger(__name__)
+
+QUANTIZATION_SCHEME_MAP_TYPE = dict[str, Optional[dict[str, QuantizationArgs]]]
+
+
+def remove_quantization_method():
+    if COMPRESSED_TENSORS_METHOD in QUANTIZATION_METHODS:
+        QUANTIZATION_METHODS.remove(COMPRESSED_TENSORS_METHOD)
+
+
+remove_quantization_method()
+
+
+@register_quantization_config(COMPRESSED_TENSORS_METHOD)
+class AscendCompressedTensorsConfig(QuantizationConfig):
+
+    def __init__(
+        self,
+        target_scheme_map: dict[str, Any],
+        ignore: list[str],
+        quant_format: str,
+        config: Optional[dict[str, Any]] = None,
+    ):
+        super().__init__()
+        self.ignore = ignore
+        self.quant_format = quant_format
+        # Map from [target -> scheme]
+        self.target_scheme_map = target_scheme_map
+        self.quant_description = config
+
+    def get_name(self) -> str:
+        return "compressed-tensors"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.int8, torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "Ascend hardware dose not support \"get_min_capability\" feature.")
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: dict[str,
+                                      Any]) -> "AscendCompressedTensorsConfig":
+        ignore: list[str] = cast(list[str], config.get("ignore", []))
+        quant_format = cast(str, config.get("format"))
+        target_scheme_map = cls._quantization_scheme_map_from_config(
+            config=config)
+
+        return cls(
+            target_scheme_map=target_scheme_map,
+            ignore=ignore,
+            quant_format=quant_format,
+            config=config,
+        )
+
+    @classmethod
+    def _quantization_scheme_map_from_config(
+            cls, config: dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE:
+        """
+        :param config: The `quantization_config` dictionary from config.json
+        :return: A dictionary mapping target layer names to their corresponding
+            quantization_args for weights and input activations
+        """
+        target_scheme_map: dict[str, Any] = dict()
+        quant_format = cast(str, config.get("format"))
+
+        # The quant_config has multiple config_groups, each containing
+        # an input_activations key with details about how the activations are
+        # quantized, a weights key indicating how the weights are quantized,
+        # and a list of targets under the `targets` key, dictating which
+        # layers are impacted by the quantization details. The quantization
+        # details follow the structure defined by the QuantizationArgs
+        # pydantic model, which is used to verify the structure of the
+        # quant_config and also store the details for later use.
+
+        config_groups = config.get("config_groups", dict())
+        for _, quant_config in config_groups.items():
+            targets = quant_config.get("targets")
+            for target in targets:
+                target_scheme_map[target] = {}
+                target_scheme_map[target][
+                    "weights"] = QuantizationArgs.model_validate(
+                        quant_config.get("weights"))
+
+                target_scheme_map[target]["input_activations"] = None
+                target_scheme_map[target]["format"] = quant_config.get(
+                    "format")
+                format = target_scheme_map[target].get("format")
+                # If no per-config format defined, use global format in config
+                act_quant_format = (
+                    is_activation_quantization_format(format)
+                    if format is not None else
+                    is_activation_quantization_format(quant_format))
+                input_activations = quant_config.get("input_activations")
+                if act_quant_format and input_activations is not None:
+                    target_scheme_map[target]["input_activations"] = (
+                        QuantizationArgs.model_validate(
+                            quant_config.get("input_activations")))
+        return target_scheme_map
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        if isinstance(layer, LinearBase):
+            # collect schemes
+            quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
+
+            # choose quantization method
+            quant_method: LinearMethodBase = UnquantizedLinearMethod()
+            if quant_scheme is not None:
+                layer.scheme = quant_scheme
+                quant_method = AscendCompressedTensorsLinearMethod(self)
+            return quant_method
+        return None
+
+    def get_scheme(self,
+                   layer: torch.nn.Module,
+                   layer_name: Optional[str] = None
+                   ) -> Optional["CompressedTensorsScheme"]:
+        """
+        compressed-tensors supports non uniform in the following way:
+
+        targets of config_groups: There can be N config_groups which each
+            have a quantization scheme. Each config_group has a list of targets
+            which can be a full layer_name, a regex for a layer_name, or
+            an nn.Module name.
+
+        Detect whether a layer_name is found in any target and
+        use the quantization scheme corresponding to the matched target
+        to select the CompressedTensorsScheme used for inference.
+        """
+
+        # Find the "target" in the compressed-tensors config
+        # that our layer conforms to.
+        if should_ignore_layer(layer_name,
+                               ignore=self.ignore,
+                               fused_mapping=self.packed_modules_mapping):
+            return None
+
+        # Will be empty for models with only sparsity
+        weight_quant = input_quant = None
+        if self.target_scheme_map:
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=self.target_scheme_map.keys(),
+                fused_mapping=self.packed_modules_mapping,
+            )
+
+            scheme_dict = self.target_scheme_map[matched_target]
+            weight_quant = scheme_dict.get("weights")
+            input_quant = scheme_dict.get("input_activations")
+
+        if weight_quant is None:
+            logger.warning_once("Acceleration for non-quantized schemes is "
+                                "not supported by Compressed Tensors. "
+                                "Falling back to UnquantizedLinearMethod")
+            return None
+
+        else:
+            # Find the quant_scheme
+            scheme = self._get_scheme_from_parts(
+                weight_quant=weight_quant,
+                input_quant=input_quant,
+            )
+        return scheme
+
+    def _get_scheme_from_parts(
+            self, weight_quant: QuantizationArgs,
+            input_quant: QuantizationArgs) -> "CompressedTensorsScheme":
+        act_quant_format = is_activation_quantization_format(self.quant_format)
+        if act_quant_format and input_quant is not None:
+            if self._is_static_tensor_w8a8(weight_quant, input_quant):
+                return CompressedTensorsW8A8()
+
+            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
+                return CompressedTensorsW8A8Dynamic()
+
+        raise NotImplementedError(
+            "No compressed-tensors compatible scheme was found.")
+
+    def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
+        is_tensor = (weight_strategy and input_quant.strategy
+                     == QuantizationStrategy.TENSOR.value)
+        is_static = not weight_quant.dynamic and not input_quant.dynamic
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+
+        # Only symmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_tensor and is_symmetric and is_static
+
+    def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
+        is_token = (weight_strategy and input_quant.strategy
+                    == QuantizationStrategy.TOKEN.value)
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+
+        # Only symmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_token and is_symmetric and is_dynamic
+
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(
+            self.target_scheme_map)
+        self.ignore = hf_to_vllm_mapper.apply_list(self.ignore)
+
+
+class AscendCompressedTensorsLinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: AscendCompressedTensorsConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the CompressedTensorsScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use the output of create_weights and the CompressedTensorsScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py b/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py
new file mode 100644
index 00000000000..7f334daf711
--- /dev/null
+++ b/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .compressed_tensors_w8a8 import CompressedTensorsW8A8
+from .compressed_tensors_w8a8_dynamic import CompressedTensorsW8A8Dynamic
+
+__all__ = ["CompressedTensorsW8A8", "CompressedTensorsW8A8Dynamic"]
\ No newline at end of file
diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
new file mode 100644
index 00000000000..e9c64509b0e
--- /dev/null
+++ b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
@@ -0,0 +1,141 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import List, Optional
+
+import torch
+import torch_npu
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \
+    CompressedTensorsScheme
+from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz
+
+logger = init_logger(__name__)
+
+
+def quant_per_tensor(in_tensor: torch.Tensor,
+                     input_scale: torch.Tensor,
+                     input_offset: torch.Tensor,
+                     function=False):
+    return torch_npu.npu_quantize(in_tensor, input_scale, input_offset,
+                                  torch.qint8, -1, function)
+
+
+class CompressedTensorsW8A8(CompressedTensorsScheme):
+
+    def __init__(self) -> None:
+        # aclnn quant matmul requires to transpose matrix B, set to true by default.
+        self.transpose_weight = not is_310p()
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "Ascend hardware dose not support \"get_min_capability\" feature.")
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        self.output_partition_sizes = output_partition_sizes
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(output_size_per_partition,
+                             input_size_per_partition,
+                             dtype=torch.int8),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((output_size_per_partition, 1),
+                             dtype=params_dtype),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE
+        input_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=params_dtype),
+            weight_loader=weight_loader,
+        )
+        input_scale[:] = torch.finfo(params_dtype).min
+        layer.register_parameter("input_scale", input_scale)
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+        if x.dtype != torch.int8:
+            x = quant_per_tensor(
+                x,
+                layer.aclnn_input_scale_reciprocal,
+                None,
+            )
+
+        if is_310p():
+            # On 300I Duo platform, we need transpose again if
+            # using nz. This transpose can be skipped in torchair.
+            output = torch_npu.npu_quant_matmul(
+                x,
+                layer.weight.data.transpose(1, 0),
+                layer.deq_scale,
+                bias=bias,
+                output_dtype=layer.params_dtype,
+            )
+        else:
+            output = torch_npu.npu_quant_matmul(
+                x,
+                layer.weight,
+                layer.deq_scale,
+                bias=bias,
+                output_dtype=layer.params_dtype,
+            )
+        return output
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.input_scale = torch.nn.Parameter(layer.input_scale.max(),
+                                               requires_grad=False)
+        expanding_factor = layer.weight.data.shape[1]
+        layer.aclnn_input_scale = torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor),
+            requires_grad=False)
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor),
+            requires_grad=False)
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        if is_enable_nz():
+            layer.weight.data = torch_npu.npu_format_cast(
+                layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
+        deq_scale = layer.input_scale.data * layer.weight_scale.data
+        layer.deq_scale = torch.nn.Parameter(deq_scale, requires_grad=False)
diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py
new file mode 100644
index 00000000000..58dcc010cff
--- /dev/null
+++ b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import List, Optional
+
+import torch
+import torch_npu
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \
+    CompressedTensorsScheme
+from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
+                                           ModelWeightParameter)
+
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
+
+logger = init_logger(__name__)
+
+
+class CompressedTensorsW8A8Dynamic(CompressedTensorsScheme):
+
+    def __init__(self) -> None:
+        # aclnn quant matmul requires to transpose matrix B, set to true by default.
+        self.transpose_weight = True
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "Ascend hardware dose not support \"get_min_capability\" feature.")
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(output_size_per_partition,
+                             input_size_per_partition,
+                             dtype=torch.int8),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((output_size_per_partition, 1),
+                             dtype=params_dtype),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+        if not isinstance(x, tuple):
+            output_dtype = x.dtype
+            quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
+        else:
+            output_dtype = layer.weight_scale.dtype
+            quantized_x, dynamic_scale = x
+
+        output = torch_npu.npu_quant_matmul(
+            quantized_x,
+            layer.weight,
+            layer.weight_scale,
+            pertoken_scale=dynamic_scale,
+            bias=bias,
+            output_dtype=output_dtype,
+        )
+        return output
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        # cast quantized weight tensors in NZ format for higher inference speed
+        if is_enable_nz():
+            layer.weight.data = torch_npu.npu_format_cast(
+                layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index c0760c800ed..86eff0f9e6a 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -94,7 +94,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
     @classmethod
     def override_quantization_method(cls, hf_quant_cfg,
                                      user_quant) -> Optional[str]:
-        if torch.npu.is_available():
+        quant_method = hf_quant_cfg.get("quant_method", None)
+        if quant_method is None and torch.npu.is_available():
             return ASCEND_QUANTIZATION_METHOD
         return None
 
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 381510809a8..3952f9672ed 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -41,6 +41,7 @@
     VllmConfig = None
 
 ASCEND_QUANTIZATION_METHOD = "ascend"
+COMPRESSED_TENSORS_METHOD = "compressed-tensors"
 SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"]
 REGISTERED_ASCEND_OPS = {}
 
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 58ac27a0d27..559030e4b3d 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -157,6 +157,8 @@ def __init__(
         # FixMe: this is a patch to fix the issue cause by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170
         from vllm.model_executor.layers.linear import \
             WEIGHT_LOADER_V2_SUPPORTED
+        WEIGHT_LOADER_V2_SUPPORTED.append(
+            "AscendCompressedTensorsLinearMethod")
         if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
             WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
 

From d7133b32ba9a55e4bf9c377480ab1cb097ec4e01 Mon Sep 17 00:00:00 2001
From: LHXuuu <scut_xlh@163.com>
Date: Thu, 20 Nov 2025 14:50:08 +0800
Subject: [PATCH 02/19] Refactoring the quantization functionality to enable
 LLM Compressor to reuse ModelSlim code for quantization

Signed-off-by: LHXuuu <scut_xlh@163.com>
---
 docs/source/user_guide/feature_guide/index.md |   1 +
 .../quantization-llm-compressor.md            |  56 ++++++
 .../quantization/llm-compressor/w8a8_int8.py  | 181 ++++++++++++++++++
 .../llm-compressor/w8a8_int8_dynamic.py       |  83 ++++++++
 pyproject.toml                                |   1 +
 requirements.txt                              |   1 +
 .../compressed_tensors/compressed_tensors.py  |  18 +-
 .../compressed_tensors/schemes/__init__.py    |   7 -
 .../schemes/compressed_tensors_w8a8.py        | 141 --------------
 .../compressed_tensors_w8a8_dynamic.py        |  89 ---------
 vllm_ascend/quantization/quant_config.py      |  33 ++--
 vllm_ascend/quantization/utils.py             |  27 ++-
 vllm_ascend/quantization/w8a8.py              |  14 +-
 13 files changed, 392 insertions(+), 260 deletions(-)
 create mode 100644 docs/source/user_guide/feature_guide/quantization-llm-compressor.md
 create mode 100644 examples/quantization/llm-compressor/w8a8_int8.py
 create mode 100644 examples/quantization/llm-compressor/w8a8_int8_dynamic.py
 delete mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/__init__.py
 delete mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
 delete mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py

diff --git a/docs/source/user_guide/feature_guide/index.md b/docs/source/user_guide/feature_guide/index.md
index b0c0fd7d462..3fa4f8f995a 100644
--- a/docs/source/user_guide/feature_guide/index.md
+++ b/docs/source/user_guide/feature_guide/index.md
@@ -7,6 +7,7 @@ This section provides a detailed usage guide of vLLM Ascend features.
 :maxdepth: 1
 graph_mode
 quantization
+quantization-llm-compressor
 sleep_mode
 structured_output
 lora
diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
new file mode 100644
index 00000000000..7fad89589d1
--- /dev/null
+++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
@@ -0,0 +1,56 @@
+# Quantization Guide
+
+Model quantization is a technique that reduces the size and computational requirements of a model by lowering the data precision of its weights and activation values, thereby saving memory and improving inference speed.
+
+## Install llm-compressor
+
+To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM.
+
+Install llm-compressor:
+
+```bash
+pip install llmcompressor
+```
+
+### Generate the W8A8 weights
+
+```bash
+cd examples/quantization/llm-compressor
+
+python3 w8a8_int8_dynamic.py
+```
+
+For more details, see the [official examples](https://github.com/vllm-project/llm-compressor/tree/main/examples).
+
+## Run the model
+
+Now, you can run the quantized model with vLLM Ascend. Examples for online and offline inference are provided as follows:
+
+### Offline inference
+
+```python
+import torch
+
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40)
+
+llm = LLM(model="{quantized_model_save_path}",
+          max_model_len=2048,
+          trust_remote_code=True)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+### Online inference
+
+Start the quantized model using vLLM Ascend; no modifications to the startup command are required.
+
diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py
new file mode 100644
index 00000000000..80899ba7220
--- /dev/null
+++ b/examples/quantization/llm-compressor/w8a8_int8.py
@@ -0,0 +1,181 @@
+import os
+import torch
+
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, \
+    AutoTokenizer, AutoProcessor, AutoConfig, AutoImageProcessor
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
+from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy
+from qwen_vl_utils import process_vision_info
+
+W8A8_W_cha_A_ten_static_symmetric = {
+    "group_0": QuantizationScheme(
+        targets=["Linear"],
+        weights=QuantizationArgs(
+            num_bits=8,
+            type=QuantizationType.INT,
+            strategy=QuantizationStrategy.CHANNEL,
+            symmetric=True,
+            dynamic=False
+        ),
+        input_activations=QuantizationArgs(
+            num_bits=8,
+            type=QuantizationType.INT,
+            strategy=QuantizationStrategy.TENSOR,
+            symmetric=True,
+            dynamic=False
+        ),
+    ),
+}
+
+# supported modifiers
+MODIFIER_DICT = {
+    "PTQ": QuantizationModifier,
+    "AWQ": AWQModifier,
+    "GPTQ": GPTQModifier,
+}
+
+# supported schemes
+SCHEMES_DICT = {
+    "W8A8_W_cha_A_ten_static_symmetric": W8A8_W_cha_A_ten_static_symmetric,
+}
+
+MODEL_DICT = {
+    "qwen3": AutoModelForCausalLM,
+}
+
+TOKENIZER_DICT = {
+    "qwen3": AutoTokenizer,
+}
+
+
+def load_environment_variables():
+    env_vars = {
+        'model_path': os.getenv('MODEL_PATH'),
+        'export_path': os.getenv('EXPORT_PATH'),
+        'modifier': os.getenv('MODIFIER'),
+        'schemes': os.getenv('SCHEMES'),
+        'calib_prompt_path': os.getenv('CALIB_PROMPT_PATH')
+    }
+
+    # verify export model path
+    if env_vars['export_path'] is None:
+        env_vars['export_path'] = env_vars['model_path'].rstrip("/") + "-" + env_vars['modifier']
+        if env_vars['schemes'] is not None:
+            env_vars['export_path'] += "-" + env_vars['schemes']
+    os.makedirs(env_vars['export_path'], exist_ok=True)
+
+    return env_vars
+
+
+def load_calibration_text_dataset(calib_prompt_path, tokenizer):
+    # Load dataset
+    for f in os.listdir(calib_prompt_path):
+        print(f)
+    if any(f.lower().endswith('.jsonl') for f in os.listdir(calib_prompt_path)):
+        ds = load_dataset('json', data_dir=calib_prompt_path, split='validation')
+    elif any(f.lower().endswith('.parquet') for f in os.listdir(calib_prompt_path)):
+        ds = load_dataset("parquet", data_dir=calib_prompt_path, split="train[:512]")
+    else:
+        raise ValueError("Unsupported calibration file format: {}".format(
+            calib_prompt_path.split('.')[-1]))
+
+    # Preprocess dataset
+    def preprocess(example):
+        if tokenizer.chat_template is not None:
+            return {"text": tokenizer.apply_chat_template(
+                example["messages"], tokenize=False)}
+        else:
+            return {"text": example["messages"]}
+
+    # Tokenize inputs
+    def tokenize(sample):
+        return tokenizer(
+            sample["text"],
+            add_special_tokens=False,
+        )
+
+    ds = ds.map(preprocess)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    return ds
+
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: torch.tensor(value, dtype=torch.bfloat16 if key == "pixel_values" else torch.long)
+        for key, value in batch[0].items()
+    }
+
+
+def quantize_model(model, config, env_vars, is_vl_model, dataset_dict=None):
+    # since the MoE gate layers are sensitive to quantization, we add them to the ignore
+    # list so they remain at full precision
+    ignore = ["lm_head", "re:.*mlp.down_proj"]
+
+    # define a llmcompressor recipe
+    recipe = [
+        MODIFIER_DICT[env_vars['modifier']](
+            config_groups=SCHEMES_DICT[env_vars['schemes']],
+            ignore=ignore,
+        ),
+    ]
+
+    if env_vars['modifier'] == 'PTQ':
+        oneshot(
+            model=model,
+            recipe=recipe,
+            trust_remote_code_model=True,
+        )
+    elif is_vl_model:
+        # quantize the model
+        oneshot(
+            model=model,
+            dataset=dataset_dict,
+            recipe=recipe,
+            data_collator=data_collator,
+            trust_remote_code_model=True,
+        )
+    else:
+        # quantize the model
+        oneshot(
+            model=model,
+            dataset=dataset_dict,
+            recipe=recipe,
+            trust_remote_code_model=True,
+        )
+
+
+def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
+    model.save_pretrained(save_path, save_compressed=save_compressed)
+    tokenizer.save_pretrained(save_path)
+
+
+if __name__ == '__main__':
+    # get environment variables
+    env_vars = load_environment_variables()
+
+    # support model type list
+    config = AutoConfig.from_pretrained(env_vars['model_path'], trust_remote_code=True)
+    model_type = config.model_type
+
+    model = MODEL_DICT[model_type].from_pretrained(
+        env_vars['model_path'], torch_dtype="auto", trust_remote_code=True
+    )
+    tokenizer = TOKENIZER_DICT[model_type].from_pretrained(env_vars['model_path'], trust_remote_code=True)
+
+    # Load the calibration dataset
+    if env_vars["calib_prompt_path"] is None:
+        env_vars["calib_prompt_path"] = "dataset/ultrachat_200k"
+
+    ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer)
+
+    # Quantize the model
+    quantize_model(model, config, env_vars, is_vl_model, ds)
+
+    # save the quantized model
+    save_quantized_model(model, tokenizer, env_vars['export_path'], True)
\ No newline at end of file
diff --git a/examples/quantization/llm-compressor/w8a8_int8_dynamic.py b/examples/quantization/llm-compressor/w8a8_int8_dynamic.py
new file mode 100644
index 00000000000..1cc9d21c663
--- /dev/null
+++ b/examples/quantization/llm-compressor/w8a8_int8_dynamic.py
@@ -0,0 +1,83 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# Select model and load it.
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# Configure algorithms. In this case, we:
+#   * apply SmoothQuant to make the activations easier to quantize
+#   * quantize the weights to int8 with GPTQ (static per channel)
+#   * quantize the activations to int8 (dynamic per token)
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+]
+
+# Apply algorithms and save to output_dir
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("npu")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-Per-Token"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 7f90b1edb4e..d5d939f6870 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ requires = [
     "quart",
     "numba",
     "opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm
+    "compressed_tensors"
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 936de5f6b31..124be0bee6c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,6 +16,7 @@ torchvision
 wheel
 pandas-stubs
 opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm
+compressed_tensors
 
 # requirements for disaggregated prefill
 msgpack
diff --git a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
index 7938d910da3..cc279cf8b74 100644
--- a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
@@ -16,12 +16,12 @@
     find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 
+from vllm_ascend.quantization.quant_config import (AscendLinearMethod,
+                                                   AscendQuantConfig)
+from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
+from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
 from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
 
-from .schemes.compressed_tensors_w8a8 import CompressedTensorsW8A8
-from .schemes.compressed_tensors_w8a8_dynamic import \
-    CompressedTensorsW8A8Dynamic
-
 if TYPE_CHECKING:
     from vllm.model_executor.models.utils import WeightsMapper
 
@@ -137,6 +137,7 @@ def get_quant_method(
         prefix: str,
     ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
+            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
             # collect schemes
             quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
 
@@ -144,7 +145,10 @@ def get_quant_method(
             quant_method: LinearMethodBase = UnquantizedLinearMethod()
             if quant_scheme is not None:
                 layer.scheme = quant_scheme
-                quant_method = AscendCompressedTensorsLinearMethod(self)
+                ascend_quant_config = AscendQuantConfig(
+                                        self.quant_description)
+                quant_method = AscendLinearMethod(ascend_quant_config,
+                                                  prefix, None, layer)
             return quant_method
         return None
 
@@ -206,10 +210,10 @@ def _get_scheme_from_parts(
         act_quant_format = is_activation_quantization_format(self.quant_format)
         if act_quant_format and input_quant is not None:
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8()
+                return AscendW8A8LinearMethod()
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8Dynamic()
+                return AscendW8A8DynamicLinearMethod()
 
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py b/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py
deleted file mode 100644
index 7f334daf711..00000000000
--- a/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from .compressed_tensors_w8a8 import CompressedTensorsW8A8
-from .compressed_tensors_w8a8_dynamic import CompressedTensorsW8A8Dynamic
-
-__all__ = ["CompressedTensorsW8A8", "CompressedTensorsW8A8Dynamic"]
\ No newline at end of file
diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
deleted file mode 100644
index e9c64509b0e..00000000000
--- a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from typing import List, Optional
-
-import torch
-import torch_npu
-from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \
-    CompressedTensorsScheme
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
-
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz
-
-logger = init_logger(__name__)
-
-
-def quant_per_tensor(in_tensor: torch.Tensor,
-                     input_scale: torch.Tensor,
-                     input_offset: torch.Tensor,
-                     function=False):
-    return torch_npu.npu_quantize(in_tensor, input_scale, input_offset,
-                                  torch.qint8, -1, function)
-
-
-class CompressedTensorsW8A8(CompressedTensorsScheme):
-
-    def __init__(self) -> None:
-        # aclnn quant matmul requires to transpose matrix B, set to true by default.
-        self.transpose_weight = not is_310p()
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        raise NotImplementedError(
-            "Ascend hardware dose not support \"get_min_capability\" feature.")
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ) -> None:
-        self.output_partition_sizes = output_partition_sizes
-        output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")
-
-        # WEIGHT
-        weight = ModelWeightParameter(
-            data=torch.empty(output_size_per_partition,
-                             input_size_per_partition,
-                             dtype=torch.int8),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
-        # WEIGHT SCALE
-        weight_scale = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1),
-                             dtype=params_dtype),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_scale", weight_scale)
-
-        # INPUT SCALE
-        input_scale = PerTensorScaleParameter(
-            data=torch.empty(len(output_partition_sizes), dtype=params_dtype),
-            weight_loader=weight_loader,
-        )
-        input_scale[:] = torch.finfo(params_dtype).min
-        layer.register_parameter("input_scale", input_scale)
-
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
-        if x.dtype != torch.int8:
-            x = quant_per_tensor(
-                x,
-                layer.aclnn_input_scale_reciprocal,
-                None,
-            )
-
-        if is_310p():
-            # On 300I Duo platform, we need transpose again if
-            # using nz. This transpose can be skipped in torchair.
-            output = torch_npu.npu_quant_matmul(
-                x,
-                layer.weight.data.transpose(1, 0),
-                layer.deq_scale,
-                bias=bias,
-                output_dtype=layer.params_dtype,
-            )
-        else:
-            output = torch_npu.npu_quant_matmul(
-                x,
-                layer.weight,
-                layer.deq_scale,
-                bias=bias,
-                output_dtype=layer.params_dtype,
-            )
-        return output
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.input_scale = torch.nn.Parameter(layer.input_scale.max(),
-                                               requires_grad=False)
-        expanding_factor = layer.weight.data.shape[1]
-        layer.aclnn_input_scale = torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor),
-            requires_grad=False)
-        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor),
-            requires_grad=False)
-        if self.transpose_weight:
-            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        if is_enable_nz():
-            layer.weight.data = torch_npu.npu_format_cast(
-                layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
-        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
-        deq_scale = layer.input_scale.data * layer.weight_scale.data
-        layer.deq_scale = torch.nn.Parameter(deq_scale, requires_grad=False)
diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py
deleted file mode 100644
index 58dcc010cff..00000000000
--- a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import List, Optional
-
-import torch
-import torch_npu
-from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \
-    CompressedTensorsScheme
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           ModelWeightParameter)
-
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
-
-logger = init_logger(__name__)
-
-
-class CompressedTensorsW8A8Dynamic(CompressedTensorsScheme):
-
-    def __init__(self) -> None:
-        # aclnn quant matmul requires to transpose matrix B, set to true by default.
-        self.transpose_weight = True
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        raise NotImplementedError(
-            "Ascend hardware dose not support \"get_min_capability\" feature.")
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ) -> None:
-        output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")
-
-        # WEIGHT
-        weight = ModelWeightParameter(
-            data=torch.empty(output_size_per_partition,
-                             input_size_per_partition,
-                             dtype=torch.int8),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
-        # WEIGHT SCALE
-        weight_scale = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1),
-                             dtype=params_dtype),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_scale", weight_scale)
-
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
-        if not isinstance(x, tuple):
-            output_dtype = x.dtype
-            quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
-        else:
-            output_dtype = layer.weight_scale.dtype
-            quantized_x, dynamic_scale = x
-
-        output = torch_npu.npu_quant_matmul(
-            quantized_x,
-            layer.weight,
-            layer.weight_scale,
-            pertoken_scale=dynamic_scale,
-            bias=bias,
-            output_dtype=output_dtype,
-        )
-        return output
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if self.transpose_weight:
-            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        # cast quantized weight tensors in NZ format for higher inference speed
-        if is_enable_nz():
-            layer.weight.data = torch_npu.npu_format_cast(
-                layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 86eff0f9e6a..383488f7f94 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -114,7 +114,7 @@ def get_quant_method(self, layer: torch.nn.Module,
                                             self.packed_modules_mapping):
                 return AscendUnquantizedLinearMethod()
             return AscendLinearMethod(self, prefix,
-                                      self.packed_modules_mapping)
+                                      self.packed_modules_mapping, layer)
         elif isinstance(layer, Attention) and \
             'fa_quant_type' in self.quant_description.keys() and \
             self.quant_description['fa_quant_type'] is not None:
@@ -127,13 +127,13 @@ def get_quant_method(self, layer: torch.nn.Module,
                                             self.packed_modules_mapping):
                 return AscendUnquantizedFusedMoEMethod(layer.moe_config)
             return AscendFusedMoEMethod(self, prefix,
-                                        self.packed_modules_mapping)
+                                        self.packed_modules_mapping, layer)
         elif isinstance(layer, VocabParallelEmbedding):
             if self.is_layer_skipped_ascend(prefix,
                                             self.packed_modules_mapping):
                 return UnquantizedEmbeddingMethod()
             return AscendEmbeddingMethod(self, prefix,
-                                         self.packed_modules_mapping)
+                                         self.packed_modules_mapping, layer)
         return None
 
     def is_layer_skipped_ascend(
@@ -259,10 +259,13 @@ class AscendLinearMethod(LinearMethodBase):
     """
 
     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
-                 packed_modules_mapping: Dict[str, Any]) -> None:
+                 packed_modules_mapping: Dict[str, Any],
+                 layer: torch.nn.Module) -> None:
         self.quant_method = get_quant_method(quant_config.quant_description,
-                                             prefix, "linear",
-                                             packed_modules_mapping)
+                                             prefix,
+                                             "linear",
+                                             packed_modules_mapping,
+                                             layer=layer)
 
     def create_weights(
         self,
@@ -401,10 +404,13 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
     """
 
     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
-                 packed_modules_mapping: Dict[str, Any]):
+                 packed_modules_mapping: Dict[str, Any],
+                 layer: torch.nn.Module):
         self.quant_method = get_quant_method(quant_config.quant_description,
-                                             prefix, "moe",
-                                             packed_modules_mapping)
+                                             prefix,
+                                             "moe",
+                                             packed_modules_mapping,
+                                             layer=layer)
 
     def create_weights(
         self,
@@ -484,7 +490,10 @@ class AscendEmbeddingMethod(AscendLinearMethod):
     """
 
     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
-                 packed_modules_mapping: Dict[str, Any]) -> None:
+                 packed_modules_mapping: Dict[str, Any],
+                 layer: torch.nn.Module) -> None:
         self.quant_method = get_quant_method(quant_config.quant_description,
-                                             prefix, "linear",
-                                             packed_modules_mapping)
+                                             prefix,
+                                             "linear",
+                                             packed_modules_mapping,
+                                             layer=layer)
diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py
index 6d914c0dade..33c979e236e 100644
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -1,7 +1,10 @@
 from typing import Any, Dict, Optional, Type
 
+import torch
 from vllm.logger import logger
 
+from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
+
 from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
 from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
                            AscendW4A8DynamicLinearMethod)
@@ -60,8 +63,28 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
 def get_quant_method(quant_description: Dict[str, Any],
                      prefix: str,
                      layer_type: str,
-                     packed_modules_mapping: Optional[Dict[str, Any]] = None):
-    logger.info_once("Using the vLLM Ascend Quantization now!")
+                     packed_modules_mapping: Optional[Dict[str, Any]],
+                     layer: torch.nn.Module = None):
+    if quant_description.get("quant_method") == COMPRESSED_TENSORS_METHOD:
+        return get_quant_method_llmcompressor(layer)
+
+    return get_quant_method_modelslim(quant_description, prefix, layer_type,
+                                      packed_modules_mapping)
+
+
+def get_quant_method_llmcompressor(layer: torch.nn.Module):
+    logger.info_once("Using the vLLM Ascend llmcompressor Quantization now!")
+    if layer.scheme is None:
+        raise ValueError("A scheme must be defined for each layer")
+    return layer.scheme
+
+
+def get_quant_method_modelslim(
+        quant_description: Dict[str, Any],
+        prefix: str,
+        layer_type: str,
+        packed_modules_mapping: Optional[Dict[str, Any]] = None):
+    logger.info_once("Using the vLLM Ascend modelslim Quantization now!")
     if packed_modules_mapping is None:
         packed_modules_mapping = dict()
     # Attention
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
index dcd692acfb6..1566d631e9c 100644
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -25,8 +25,9 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz
-
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
+                               COMPRESSED_TENSORS_METHOD, is_310p,
+                               is_enable_nz)
 
 def quant_per_tensor(in_tensor: torch.Tensor,
                      input_scale: torch.Tensor,
@@ -147,6 +148,10 @@ def apply(
                 )
 
         quant_bias = layer.quant_bias if tp_rank == 0 else None
+        if getattr(layer, "ascend_quant_method",
+                   "") == COMPRESSED_TENSORS_METHOD:
+            quant_bias = bias
+
         if is_310p():
             # On 300I Duo platform, we need transpose again if
             # using nz. This transpose can be skipped in torchair.
@@ -185,6 +190,11 @@ def process_weights_after_loading(self, layer):
                 layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
         layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
         layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
+        if getattr(layer, "ascend_quant_method",
+                   "") == COMPRESSED_TENSORS_METHOD:
+            deq_scale = layer.input_scale.data * layer.weight_scale.data
+            layer.deq_scale = torch.nn.Parameter(deq_scale,
+                                                 requires_grad=False)
 
 
 class AscendW8A8FusedMoEMethod:

From 84750597acaa011b72b12ab21bc12f431ab5712d Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Fri, 21 Nov 2025 20:41:19 +0800
Subject: [PATCH 03/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 .github/workflows/_e2e_test.yaml              |  1 +
 .../quantization-llm-compressor.md            |  1 -
 .../quantization/llm-compressor/w8a8_int8.py  | 45 +++++---------
 mypy.ini                                      |  9 +++
 requirements.txt                              |  2 +-
 tests/e2e/multicard/test_quantization.py      | 40 +++++++++++++
 tests/ut/quantization/test_quant_config.py    |  4 +-
 .../compressed_tensors/compressed_tensors.py  | 60 ++-----------------
 vllm_ascend/quantization/quant_config.py      | 23 ++++---
 vllm_ascend/quantization/utils.py             |  2 +-
 vllm_ascend/quantization/w8a8.py              |  1 +
 vllm_ascend/worker/worker_v1.py               |  2 -
 12 files changed, 88 insertions(+), 102 deletions(-)
 create mode 100644 tests/e2e/multicard/test_quantization.py

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index be5b43e6373..b8dd9871164 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -203,6 +203,7 @@ jobs:
           pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
           pytest -sv tests/e2e/multicard/test_prefix_caching.py
           pytest -sv tests/e2e/multicard/test_qwen3_moe.py
+          pytest -sv tests/e2e/multicard/test_quantization.py
 
   e2e-4-cards:
     name: multicard-4
diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
index 7fad89589d1..c523ac4ff72 100644
--- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
+++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
@@ -53,4 +53,3 @@ for output in outputs:
 ### Online inference
 
 Start the quantized model using vLLM Ascend; no modifications to the startup command are required.
-
diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py
index 80899ba7220..f5e812c12df 100644
--- a/examples/quantization/llm-compressor/w8a8_int8.py
+++ b/examples/quantization/llm-compressor/w8a8_int8.py
@@ -9,7 +9,6 @@
 from llmcompressor.modifiers.awq import AWQModifier
 from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
 from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy
-from qwen_vl_utils import process_vision_info
 
 W8A8_W_cha_A_ten_static_symmetric = {
     "group_0": QuantizationScheme(
@@ -54,11 +53,11 @@
 
 def load_environment_variables():
     env_vars = {
-        'model_path': os.getenv('MODEL_PATH'),
-        'export_path': os.getenv('EXPORT_PATH'),
-        'modifier': os.getenv('MODIFIER'),
-        'schemes': os.getenv('SCHEMES'),
-        'calib_prompt_path': os.getenv('CALIB_PROMPT_PATH')
+        'model_path': "Qwen3-32B",
+        'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric",
+        'modifier': "GPTQ",
+        'schemes': "W8A8_W_cha_A_ten_static_symmetric",
+        'calib_prompt_path': "dataset/ultrachat_200k"
     }
 
     # verify export model path
@@ -112,7 +111,7 @@ def data_collator(batch):
     }
 
 
-def quantize_model(model, config, env_vars, is_vl_model, dataset_dict=None):
+def quantize_model(model, env_vars, dataset_dict=None):
     # since the MoE gate layers are sensitive to quantization, we add them to the ignore
     # list so they remain at full precision
     ignore = ["lm_head", "re:.*mlp.down_proj"]
@@ -125,29 +124,13 @@ def quantize_model(model, config, env_vars, is_vl_model, dataset_dict=None):
         ),
     ]
 
-    if env_vars['modifier'] == 'PTQ':
-        oneshot(
-            model=model,
-            recipe=recipe,
-            trust_remote_code_model=True,
-        )
-    elif is_vl_model:
-        # quantize the model
-        oneshot(
-            model=model,
-            dataset=dataset_dict,
-            recipe=recipe,
-            data_collator=data_collator,
-            trust_remote_code_model=True,
-        )
-    else:
-        # quantize the model
-        oneshot(
-            model=model,
-            dataset=dataset_dict,
-            recipe=recipe,
-            trust_remote_code_model=True,
-        )
+    # quantize the model
+    oneshot(
+        model=model,
+        dataset=dataset_dict,
+        recipe=recipe,
+        trust_remote_code_model=True,
+    )
 
 
 def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
@@ -175,7 +158,7 @@ def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
     ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer)
 
     # Quantize the model
-    quantize_model(model, config, env_vars, is_vl_model, ds)
+    quantize_model(model, env_vars, ds)
 
     # save the quantized model
     save_quantized_model(model, tokenizer, env_vars['export_path'], True)
\ No newline at end of file
diff --git a/mypy.ini b/mypy.ini
index 6fe8e6c2986..cdd99e92e0d 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -13,4 +13,13 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 
 [mypy-lm_eval.*]
+ignore_missing_imports = True
+
+[mypy-compressed_tensors.*]
+ignore_missing_imports = True
+
+[mypy-datasets.*]
+ignore_missing_imports = True
+
+[mypy-llmcompressor.*]
 ignore_missing_imports = True
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 124be0bee6c..566b6f89f3e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,7 @@ torchvision
 wheel
 pandas-stubs
 opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm
-compressed_tensors
+compressed_tensors>=0.11.0
 
 # requirements for disaggregated prefill
 msgpack
diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py
new file mode 100644
index 00000000000..f14fd15b3cc
--- /dev/null
+++ b/tests/e2e/multicard/test_quantization.py
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+#
+"""Run a compressed-tensors W8A8 quantized model with greedy sampling.
+
+Run `pytest tests/e2e/multicard/test_quantization.py`.
+"""
+from modelscope import snapshot_download  # type: ignore
+
+from tests.e2e.conftest import VllmRunner
+
+
+def test_models_distributed_quantized_W8A8():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+                    tensor_parallel_size=4,
+                    max_model_len=4096,
+                    gpu_memory_utilization=0.8,
+                    distributed_executor_backend="mp",
+                    enforce_eager=True) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+        del vllm_model
diff --git a/tests/ut/quantization/test_quant_config.py b/tests/ut/quantization/test_quant_config.py
index 4622692dd00..b667767ba79 100644
--- a/tests/ut/quantization/test_quant_config.py
+++ b/tests/ut/quantization/test_quant_config.py
@@ -65,7 +65,7 @@ def test_override_quantization_method(self, mock_is_available):
         # Test when NPU is available
         mock_is_available.return_value = True
         result = AscendQuantConfig.override_quantization_method(None, None)
-        self.assertEqual(result, ASCEND_QUANTIZATION_METHOD)
+        self.assertIsNone(result)
 
         # Test when NPU is not available
         mock_is_available.return_value = False
@@ -93,7 +93,7 @@ def test_get_quant_method_for_linear(self):
             self.assertIs(method, mock_ascend_linear.return_value)
             mock_ascend_linear.assert_called_once_with(
                 self.ascend_config, ".attn",
-                self.ascend_config.packed_modules_mapping)
+                self.ascend_config.packed_modules_mapping, linear_layer)
 
     def test_get_quant_method_for_attention(self):
         attention_layer = MagicMock(spec=Attention)
diff --git a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
index cc279cf8b74..f95ff7f0215 100644
--- a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
@@ -145,10 +145,10 @@ def get_quant_method(
             quant_method: LinearMethodBase = UnquantizedLinearMethod()
             if quant_scheme is not None:
                 layer.scheme = quant_scheme
-                ascend_quant_config = AscendQuantConfig(
-                                        self.quant_description)
-                quant_method = AscendLinearMethod(ascend_quant_config,
-                                                  prefix, None, layer)
+                ascend_quant_config = AscendQuantConfig(self.quant_description
+                                                        or {})
+                quant_method = AscendLinearMethod(ascend_quant_config, prefix,
+                                                  None, layer)
             return quant_method
         return None
 
@@ -250,55 +250,3 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         self.target_scheme_map = hf_to_vllm_mapper.apply_dict(
             self.target_scheme_map)
         self.ignore = hf_to_vllm_mapper.apply_list(self.ignore)
-
-
-class AscendCompressedTensorsLinearMethod(LinearMethodBase):
-
-    def __init__(self, quantization_config: AscendCompressedTensorsConfig):
-        self.quantization_config = quantization_config
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.scheme.process_weights_after_loading(layer)
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: list[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        """
-        Use the CompressedTensorsScheme associated with each layer to create
-        the necessary parameters for the layer. See LinearMethodBase for param
-        details
-        """
-        weight_loader = extra_weight_attrs.get("weight_loader")
-        layer.scheme.create_weights(
-            layer=layer,
-            input_size=input_size,
-            input_size_per_partition=input_size_per_partition,
-            output_partition_sizes=output_partition_sizes,
-            output_size=output_size,
-            params_dtype=params_dtype,
-            weight_loader=weight_loader,
-        )
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ):
-        """
-        Use the output of create_weights and the CompressedTensorsScheme
-        associated with the layer to apply the forward pass with the
-        layer input.  See LinearMethodBase for param details
-
-        """
-        scheme = layer.scheme
-        if scheme is None:
-            raise ValueError("A scheme must be defined for each layer")
-        return scheme.apply_weights(layer, x, bias=bias)
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 383488f7f94..72c04e50b70 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -94,9 +94,10 @@ def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
     @classmethod
     def override_quantization_method(cls, hf_quant_cfg,
                                      user_quant) -> Optional[str]:
-        quant_method = hf_quant_cfg.get("quant_method", None)
-        if quant_method is None and torch.npu.is_available():
-            return ASCEND_QUANTIZATION_METHOD
+        if hf_quant_cfg is not None:
+            quant_method = hf_quant_cfg.get("quant_method", None)
+            if quant_method is None and torch.npu.is_available():
+                return ASCEND_QUANTIZATION_METHOD
         return None
 
     def get_quant_method(self, layer: torch.nn.Module,
@@ -223,6 +224,8 @@ def get_scaled_act_names(self) -> List[str]:
         ],
         "gate_up_proj": ["gate_proj", "up_proj"],
         "in_proj": ["in_proj_qkvz", "in_proj_ba"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
     },
     "qwen2_5_vl": {
         "qkv_proj": [
@@ -258,9 +261,11 @@ class AscendLinearMethod(LinearMethodBase):
         quant_config: The Ascend quantization config.
     """
 
-    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
-                 packed_modules_mapping: Dict[str, Any],
-                 layer: torch.nn.Module) -> None:
+    def __init__(self,
+                 quant_config: AscendQuantConfig,
+                 prefix: str,
+                 packed_modules_mapping: Dict[str, Any] | None,
+                 layer: torch.nn.Module = None) -> None:
         self.quant_method = get_quant_method(quant_config.quant_description,
                                              prefix,
                                              "linear",
@@ -403,9 +408,11 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
         quant_config: The Ascend quantization config.
     """
 
-    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
+    def __init__(self,
+                 quant_config: AscendQuantConfig,
+                 prefix: str,
                  packed_modules_mapping: Dict[str, Any],
-                 layer: torch.nn.Module):
+                 layer: torch.nn.Module = None):
         self.quant_method = get_quant_method(quant_config.quant_description,
                                              prefix,
                                              "moe",
diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py
index 33c979e236e..eaaaee86702 100644
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -63,7 +63,7 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
 def get_quant_method(quant_description: Dict[str, Any],
                      prefix: str,
                      layer_type: str,
-                     packed_modules_mapping: Optional[Dict[str, Any]],
+                     packed_modules_mapping: Optional[Dict[str, Any]] = None,
                      layer: torch.nn.Module = None):
     if quant_description.get("quant_method") == COMPRESSED_TENSORS_METHOD:
         return get_quant_method_llmcompressor(layer)
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
index 1566d631e9c..96957597b61 100644
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -29,6 +29,7 @@
                                COMPRESSED_TENSORS_METHOD, is_310p,
                                is_enable_nz)
 
+
 def quant_per_tensor(in_tensor: torch.Tensor,
                      input_scale: torch.Tensor,
                      input_offset: torch.Tensor,
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 559030e4b3d..58ac27a0d27 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -157,8 +157,6 @@ def __init__(
         # FixMe: this is a patch to fix the issue cause by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170
         from vllm.model_executor.layers.linear import \
             WEIGHT_LOADER_V2_SUPPORTED
-        WEIGHT_LOADER_V2_SUPPORTED.append(
-            "AscendCompressedTensorsLinearMethod")
         if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
             WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
 

From d8b4dbf00372682d9c9aae39a03f95c6f49ac4f9 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Mon, 24 Nov 2025 09:18:11 +0800
Subject: [PATCH 04/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 vllm_ascend/platform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index a4fec06aa12..7a62b0b946b 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -30,7 +30,7 @@
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, 
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
                                COMPRESSED_TENSORS_METHOD, enable_sp, is_310p,
                                is_vl_model, prefill_context_parallel_enable,
                                update_aclgraph_sizes,

From bb60f2a7dc085c74e6272e8d9e0c530ffc93201b Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Mon, 24 Nov 2025 11:14:11 +0800
Subject: [PATCH 05/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d5d939f6870..3ee659e6b51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ requires = [
     "quart",
     "numba",
     "opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm
-    "compressed_tensors"
+    "compressed_tensors>=0.11.0"
 ]
 build-backend = "setuptools.build_meta"
 

From 9b402496a1af62ac840654b0202778a30cd220fb Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Mon, 24 Nov 2025 14:09:27 +0800
Subject: [PATCH 06/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 tests/ut/test_platform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index f7a6cbd1519..3530185d9a7 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -53,7 +53,7 @@ def test_class_variables(self):
                          "ASCEND_RT_VISIBLE_DEVICES")
         self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1")
         self.assertEqual(NPUPlatform.supported_quantization,
-                         [ASCEND_QUANTIZATION_METHOD])
+                         [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD])
 
     def test_is_sleep_mode_available(self):
         self.assertTrue(self.platform.is_sleep_mode_available())

From 7c15337449b0fa0a04206ca09d6e694c9d278be3 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Mon, 24 Nov 2025 14:41:01 +0800
Subject: [PATCH 07/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 tests/ut/test_platform.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 3530185d9a7..44ac831e6de 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -9,7 +9,8 @@
 
 from tests.ut.base import TestBase
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
+                               COMPRESSED_TENSORS_METHOD, vllm_version_is)
 
 if vllm_version_is("0.11.0"):
     from vllm.config.compilation import CompilationLevel
@@ -52,8 +53,9 @@ def test_class_variables(self):
         self.assertEqual(NPUPlatform.device_control_env_var,
                          "ASCEND_RT_VISIBLE_DEVICES")
         self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1")
-        self.assertEqual(NPUPlatform.supported_quantization,
-                         [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD])
+        self.assertEqual(
+            NPUPlatform.supported_quantization,
+            [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD])
 
     def test_is_sleep_mode_available(self):
         self.assertTrue(self.platform.is_sleep_mode_available())

From e388e2f829f50a4d1080fca0d7c39eaba039494f Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Mon, 24 Nov 2025 17:32:20 +0800
Subject: [PATCH 08/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 tests/ut/test_platform.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 398ea6371d2..91d30ad9818 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -10,12 +10,7 @@
 from tests.ut.base import TestBase
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
-                               COMPRESSED_TENSORS_METHOD, vllm_version_is)
-
-if vllm_version_is("0.11.0"):
-    from vllm.config.compilation import CompilationLevel
-else:
-    from vllm.config.compilation import CompilationMode
+                               COMPRESSED_TENSORS_METHOD)
 
 
 class TestNPUPlatform(TestBase):

From 7a076814460b01d9f1fb82c09bf3e116adaa348c Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Tue, 25 Nov 2025 16:19:53 +0800
Subject: [PATCH 09/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 examples/quantization/llm-compressor/w8a8_int8.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py
index f5e812c12df..cfd75b916c7 100644
--- a/examples/quantization/llm-compressor/w8a8_int8.py
+++ b/examples/quantization/llm-compressor/w8a8_int8.py
@@ -53,11 +53,11 @@
 
 def load_environment_variables():
     env_vars = {
-        'model_path': "Qwen3-32B",
+        'model_path': "Qwen/Qwen3-32B",
         'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric",
         'modifier': "GPTQ",
         'schemes': "W8A8_W_cha_A_ten_static_symmetric",
-        'calib_prompt_path': "dataset/ultrachat_200k"
+        'calib_prompt_path': "HuggingFaceH4/ultrachat_200k"
     }
 
     # verify export model path

From 7c1848eed9ab566fb8e52f05939eb20b2f15ecff Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Tue, 25 Nov 2025 17:41:03 +0800
Subject: [PATCH 10/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 examples/quantization/llm-compressor/w8a8_int8.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py
index cfd75b916c7..9a6cb392f0c 100644
--- a/examples/quantization/llm-compressor/w8a8_int8.py
+++ b/examples/quantization/llm-compressor/w8a8_int8.py
@@ -151,10 +151,6 @@ def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
     )
     tokenizer = TOKENIZER_DICT[model_type].from_pretrained(env_vars['model_path'], trust_remote_code=True)
 
-    # Load the calibration dataset
-    if env_vars["calib_prompt_path"] is None:
-        env_vars["calib_prompt_path"] = "dataset/ultrachat_200k"
-
     ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer)
 
     # Quantize the model

From 85283b1fd536955fad55775f7f296bf164f04a44 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Tue, 25 Nov 2025 21:11:36 +0800
Subject: [PATCH 11/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 tests/e2e/multicard/test_quantization.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py
index f14fd15b3cc..aaeb62401e5 100644
--- a/tests/e2e/multicard/test_quantization.py
+++ b/tests/e2e/multicard/test_quantization.py
@@ -31,10 +31,8 @@ def test_models_distributed_quantized_W8A8():
     ]
     max_tokens = 5
     with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
-                    tensor_parallel_size=4,
+                    tensor_parallel_size=2,
                     max_model_len=4096,
                     gpu_memory_utilization=0.8,
-                    distributed_executor_backend="mp",
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
-        del vllm_model

From 24f6596d92e27e9f3740b2f53009aa4248e45ecc Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Tue, 25 Nov 2025 21:12:47 +0800
Subject: [PATCH 12/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 .github/workflows/_e2e_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index ea56813c0a9..272116aa3d0 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -182,6 +182,7 @@ jobs:
           VLLM_USE_MODELSCOPE: True
         if: ${{ inputs.type == 'full' }}
         run: |
+          pytest -sv tests/e2e/multicard/test_quantization.py
           pytest -sv tests/e2e/multicard/test_aclgraph_capture_replay.py
           pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py
           pytest -sv tests/e2e/multicard/test_full_graph_mode.py
@@ -207,7 +208,6 @@ jobs:
           pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
           pytest -sv tests/e2e/multicard/test_prefix_caching.py
           pytest -sv tests/e2e/multicard/test_qwen3_moe.py
-          pytest -sv tests/e2e/multicard/test_quantization.py
 
   e2e-4-cards:
     name: multicard-4

From 8c59b6c85929708adf99858e3aa6b5e4fbd4bbd8 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Wed, 26 Nov 2025 14:32:12 +0800
Subject: [PATCH 13/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 .../feature_guide/quantization-llm-compressor.md     | 12 +++++++++++-
 tests/e2e/multicard/test_quantization.py             | 10 +++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
index c523ac4ff72..df6f489035a 100644
--- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
+++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
@@ -1,7 +1,17 @@
-# Quantization Guide
+# llm-compressor Quantization Guide
 
 Model quantization is a technique that reduces the size and computational requirements of a model by lowering the data precision of the weights and activation values in the model, thereby saving the memory and improving the inference speed.
 
+## Supported llm-compressor Quantization Types
+
+The CompressedTensorsW8A8 scheme (static activation quantization) is supported.
+
+weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric.
+
+The CompressedTensorsW8A8Dynamic scheme (dynamic activation quantization) is supported.
+
+weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic.
+
 ## Install llm-compressor
 
 To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM.
diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py
index aaeb62401e5..ecb2f49575f 100644
--- a/tests/e2e/multicard/test_quantization.py
+++ b/tests/e2e/multicard/test_quantization.py
@@ -35,4 +35,12 @@ def test_models_distributed_quantized_W8A8():
                     max_model_len=4096,
                     gpu_memory_utilization=0.8,
                     enforce_eager=False) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    golden_results = [
+        'Hello, my name is the head of state and',
+    ]
+
+    for i in range(len(vllm_output)):
+        assert golden_results[i] == vllm_output[i][1]
+        print(f"Generated text: {vllm_output[i][1]!r}")

From 7c01955332aa64ea7584aaf8d7cb80d2471e6985 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Wed, 26 Nov 2025 15:14:06 +0800
Subject: [PATCH 14/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 vllm_ascend/platform.py          | 5 ++---
 vllm_ascend/quantization/w8a8.py | 8 ++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 9a57ed63230..5140d697f15 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -30,9 +30,8 @@
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, 
-                               COMPRESSED_TENSORS_METHOD,
-                               AscendDeviceType,
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
+                               COMPRESSED_TENSORS_METHOD, AscendDeviceType,
                                enable_sp, get_ascend_device_type, is_vl_model,
                                prefill_context_parallel_enable,
                                update_aclgraph_sizes,
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
index b109716e0af..8a7bbfe7263 100644
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -25,9 +25,9 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
-                               get_ascend_device_type, is_enable_nz,
-                               COMPRESSED_TENSORS_METHOD)
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
+                               COMPRESSED_TENSORS_METHOD, AscendDeviceType,
+                               get_ascend_device_type, is_enable_nz)
 
 
 def quant_per_tensor(in_tensor: torch.Tensor,
@@ -153,7 +153,7 @@ def apply(
         if getattr(layer, "ascend_quant_method",
                    "") == COMPRESSED_TENSORS_METHOD:
             quant_bias = bias
-            
+
         if get_ascend_device_type() == AscendDeviceType._310P:
             # On 300I Duo platform, we need transpose again if
             # using nz. This transpose can be skipped in torchair.

From 2f24a00758e351371948b89469e74465b03790d5 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Wed, 26 Nov 2025 15:21:18 +0800
Subject: [PATCH 15/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 tests/e2e/multicard/test_quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py
index ecb2f49575f..67c57daf09e 100644
--- a/tests/e2e/multicard/test_quantization.py
+++ b/tests/e2e/multicard/test_quantization.py
@@ -27,7 +27,7 @@
 
 def test_models_distributed_quantized_W8A8():
     example_prompts = [
-        "Hello, my name is",
+        "The president of the United States is",
     ]
     max_tokens = 5
     with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
@@ -38,7 +38,7 @@ def test_models_distributed_quantized_W8A8():
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     golden_results = [
-        'Hello, my name is the head of state and',
+        'The president of the United States is the head of state and',
     ]
 
     for i in range(len(vllm_output)):

From e7e110079359c3b7f8a2280f08e608a5f224aec8 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Wed, 26 Nov 2025 17:07:36 +0800
Subject: [PATCH 16/19] CI problems

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 vllm_ascend/platform.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 5140d697f15..9e8b2593109 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -30,13 +30,13 @@
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
-                               COMPRESSED_TENSORS_METHOD, AscendDeviceType,
-                               enable_sp, get_ascend_device_type, is_vl_model,
-                               prefill_context_parallel_enable,
-                               update_aclgraph_sizes,
-                               update_cudagraph_capture_sizes,
-                               update_default_aclgraph_sizes)
+
+# isort: off
+from vllm_ascend.utils import (
+    ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType,
+    enable_sp, get_ascend_device_type, is_vl_model,
+    prefill_context_parallel_enable, update_aclgraph_sizes,
+    update_cudagraph_capture_sizes, update_default_aclgraph_sizes)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig

From 14354e9bba70673a5c035838396c97fde378b909 Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Thu, 27 Nov 2025 09:20:58 +0800
Subject: [PATCH 17/19] Fix CI: drop colon after "Install llm-compressor" in docs

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 .../user_guide/feature_guide/quantization-llm-compressor.md     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
index df6f489035a..bd856d5f586 100644
--- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
+++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
@@ -16,7 +16,7 @@ weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dy
 
 To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM.
 
-Install llm-compressor:
+Install llm-compressor
 
 ```bash
 pip install llmcompressor

From b62bf8c253a59746439bfd2b6f31bb2ac5799c4f Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Thu, 27 Nov 2025 21:06:11 +0800
Subject: [PATCH 18/19] Fix CI: end "Support ..." lines with colons in docs

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 .../user_guide/feature_guide/quantization-llm-compressor.md   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
index bd856d5f586..b0c543f47a9 100644
--- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
+++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
@@ -4,11 +4,11 @@ Model quantization is a technique that reduces the size and computational requir
 
 ## Supported llm-compressor Quantization Types
 
-Support CompressedTensorsW8A8 static weight.
+Support CompressedTensorsW8A8 static weight:
 
 weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric.
 
-Support CompressedTensorsW8A8Dynamic weight.
+Support CompressedTensorsW8A8Dynamic weight:
 
 weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic.
 

From 6dbc2b311b39a3eda7d584136474840b5a75facf Mon Sep 17 00:00:00 2001
From: chenxi-hh <chen464822955@163.com>
Date: Fri, 28 Nov 2025 09:07:06 +0800
Subject: [PATCH 19/19] Fix CI: remove trailing colons from "Support ..." lines in docs

Signed-off-by: chenxi-hh <chen464822955@163.com>
---
 .../user_guide/feature_guide/quantization-llm-compressor.md   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
index b0c543f47a9..a97b4de2940 100644
--- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
+++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md
@@ -4,11 +4,11 @@ Model quantization is a technique that reduces the size and computational requir
 
 ## Supported llm-compressor Quantization Types
 
-Support CompressedTensorsW8A8 static weight:
+Support CompressedTensorsW8A8 static weight
 
 weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric.
 
-Support CompressedTensorsW8A8Dynamic weight:
+Support CompressedTensorsW8A8Dynamic weight
 
 weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic.