From b41642affaa2ba35fc7e8ef960464ddb77163f00 Mon Sep 17 00:00:00 2001 From: LHXuuu Date: Thu, 13 Nov 2025 14:14:37 +0800 Subject: [PATCH 01/19] support compressed tensors w8a8 static and dynamic quantization Signed-off-by: LHXuuu --- vllm_ascend/platform.py | 9 +- .../compressed_tensors/__init__.py | 0 .../compressed_tensors/compressed_tensors.py | 300 ++++++++++++++++++ .../compressed_tensors/schemes/__init__.py | 7 + .../schemes/compressed_tensors_w8a8.py | 141 ++++++++ .../compressed_tensors_w8a8_dynamic.py | 89 ++++++ vllm_ascend/quantization/quant_config.py | 3 +- vllm_ascend/utils.py | 1 + vllm_ascend/worker/worker_v1.py | 2 + 9 files changed, 549 insertions(+), 3 deletions(-) create mode 100644 vllm_ascend/quantization/compressed_tensors/__init__.py create mode 100644 vllm_ascend/quantization/compressed_tensors/compressed_tensors.py create mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/__init__.py create mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py create mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 5559df8c02f..8aa1d4b4c0e 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -30,7 +30,8 @@ init_ascend_config) from vllm_ascend.torchair.utils import (check_torchair_cache_exist, delete_torchair_cache_file) -from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p, +from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, + COMPRESSED_TENSORS_METHOD, enable_sp, is_310p, prefill_context_parallel_enable, update_aclgraph_sizes, update_cudagraph_capture_sizes, @@ -55,7 +56,9 @@ class NPUPlatform(Platform): device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES" dispatch_key: str = "PrivateUse1" - supported_quantization: list[str] = [ASCEND_QUANTIZATION_METHOD] + supported_quantization: list[str] = [ + ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD + ] def is_sleep_mode_available(self) -> bool: return True @@ -78,6 +81,8 @@ def pre_register_and_update(cls, if ASCEND_QUANTIZATION_METHOD not in quant_action.choices: quant_action.choices.append(ASCEND_QUANTIZATION_METHOD) + from vllm_ascend.quantization.compressed_tensors.compressed_tensors import \ + AscendCompressedTensorsConfig # noqa: F401 from vllm_ascend.quantization.quant_config import \ AscendQuantConfig # noqa: F401 diff --git a/vllm_ascend/quantization/compressed_tensors/__init__.py b/vllm_ascend/quantization/compressed_tensors/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py new file mode 100644 index 00000000000..7938d910da3 --- /dev/null +++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py @@ -0,0 +1,300 @@ +from typing import TYPE_CHECKING, Any, Optional, cast + +import torch +from compressed_tensors.quantization import (QuantizationArgs, + QuantizationStrategy) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import ( + QUANTIZATION_METHODS, register_quantization_config) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \ + CompressedTensorsScheme +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + find_matched_target, is_activation_quantization_format, + should_ignore_layer) + +from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD + +from .schemes.compressed_tensors_w8a8 import CompressedTensorsW8A8 +from .schemes.compressed_tensors_w8a8_dynamic import \ + CompressedTensorsW8A8Dynamic + +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + +logger = init_logger(__name__) + +QUANTIZATION_SCHEME_MAP_TYPE = dict[str, Optional[dict[str, QuantizationArgs]]] + + +def remove_quantization_method(): + if COMPRESSED_TENSORS_METHOD in QUANTIZATION_METHODS: + QUANTIZATION_METHODS.remove(COMPRESSED_TENSORS_METHOD) + + +remove_quantization_method() + + +@register_quantization_config(COMPRESSED_TENSORS_METHOD) +class AscendCompressedTensorsConfig(QuantizationConfig): + + def __init__( + self, + target_scheme_map: dict[str, Any], + ignore: list[str], + quant_format: str, + config: Optional[dict[str, Any]] = None, + ): + super().__init__() + self.ignore = ignore + self.quant_format = quant_format + # Map from [target -> scheme] + self.target_scheme_map = target_scheme_map + self.quant_description = config + + def get_name(self) -> str: + return "compressed-tensors" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.int8, torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + raise NotImplementedError( + "Ascend hardware dose not support \"get_min_capability\" feature.") + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + @classmethod + def from_config(cls, config: dict[str, + Any]) -> "AscendCompressedTensorsConfig": + ignore: list[str] = cast(list[str], config.get("ignore", [])) + quant_format = cast(str, config.get("format")) + target_scheme_map = cls._quantization_scheme_map_from_config( + config=config) + + return cls( + target_scheme_map=target_scheme_map, + ignore=ignore, + quant_format=quant_format, + config=config, + ) + + @classmethod + def _quantization_scheme_map_from_config( + cls, config: dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE: + """ + :param config: The `quantization_config` dictionary from config.json + :return: A dictionary mapping target layer names to their corresponding + quantization_args for weights and input activations + """ + target_scheme_map: dict[str, Any] = dict() + quant_format = cast(str, config.get("format")) + + # The quant_config has multiple config_groups, each containing + # an input_activations key with details about how the activations are + # quantized, a weights key indicating how the weights are quantized, + # and a list of targets under the `targets` key, dictating which + # layers are impacted by the quantization details. The quantization + # details follow the structure defined by the QuantizationArgs + # pydantic model, which is used to verify the structure of the + # quant_config and also store the details for later use. + + config_groups = config.get("config_groups", dict()) + for _, quant_config in config_groups.items(): + targets = quant_config.get("targets") + for target in targets: + target_scheme_map[target] = {} + target_scheme_map[target][ + "weights"] = QuantizationArgs.model_validate( + quant_config.get("weights")) + + target_scheme_map[target]["input_activations"] = None + target_scheme_map[target]["format"] = quant_config.get( + "format") + format = target_scheme_map[target].get("format") + # If no per-config format defined, use global format in config + act_quant_format = ( + is_activation_quantization_format(format) + if format is not None else + is_activation_quantization_format(quant_format)) + input_activations = quant_config.get("input_activations") + if act_quant_format and input_activations is not None: + target_scheme_map[target]["input_activations"] = ( + QuantizationArgs.model_validate( + quant_config.get("input_activations"))) + return target_scheme_map + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str, + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase): + # collect schemes + quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) + + # choose quantization method + quant_method: LinearMethodBase = UnquantizedLinearMethod() + if quant_scheme is not None: + layer.scheme = quant_scheme + quant_method = AscendCompressedTensorsLinearMethod(self) + return quant_method + return None + + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None + ) -> Optional["CompressedTensorsScheme"]: + """ + compressed-tensors supports non uniform in the following way: + + targets of config_groups: There can be N config_groups which each + have a quantization scheme. Each config_group has a list of targets + which can be a full layer_name, a regex for a layer_name, or + an nn.Module name. + + Detect whether a layer_name is found in any target and + use the quantization scheme corresponding to the matched target + to select the CompressedTensorsScheme used for inference. + """ + + # Find the "target" in the compressed-tensors config + # that our layer conforms to. + if should_ignore_layer(layer_name, + ignore=self.ignore, + fused_mapping=self.packed_modules_mapping): + return None + + # Will be empty for models with only sparsity + weight_quant = input_quant = None + if self.target_scheme_map: + matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=self.target_scheme_map.keys(), + fused_mapping=self.packed_modules_mapping, + ) + + scheme_dict = self.target_scheme_map[matched_target] + weight_quant = scheme_dict.get("weights") + input_quant = scheme_dict.get("input_activations") + + if weight_quant is None: + logger.warning_once("Acceleration for non-quantized schemes is " + "not supported by Compressed Tensors. " + "Falling back to UnquantizedLinearMethod") + return None + + else: + # Find the quant_scheme + scheme = self._get_scheme_from_parts( + weight_quant=weight_quant, + input_quant=input_quant, + ) + return scheme + + def _get_scheme_from_parts( + self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> "CompressedTensorsScheme": + act_quant_format = is_activation_quantization_format(self.quant_format) + if act_quant_format and input_quant is not None: + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8() + + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8Dynamic() + + raise NotImplementedError( + "No compressed-tensors compatible scheme was found.") + + def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_tensor = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TENSOR.value) + is_static = not weight_quant.dynamic and not input_quant.dynamic + is_symmetric = weight_quant.symmetric and input_quant.symmetric + + # Only symmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_8_bits and is_tensor and is_symmetric and is_static + + def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_token = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TOKEN.value) + is_dynamic = not weight_quant.dynamic and input_quant.dynamic + is_symmetric = weight_quant.symmetric and input_quant.symmetric + + # Only symmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_8_bits and is_token and is_symmetric and is_dynamic + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + self.target_scheme_map = hf_to_vllm_mapper.apply_dict( + self.target_scheme_map) + self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) + + +class AscendCompressedTensorsLinearMethod(LinearMethodBase): + + def __init__(self, quantization_config: AscendCompressedTensorsConfig): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.scheme.process_weights_after_loading(layer) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + """ + Use the CompressedTensorsScheme associated with each layer to create + the necessary parameters for the layer. See LinearMethodBase for param + details + """ + weight_loader = extra_weight_attrs.get("weight_loader") + layer.scheme.create_weights( + layer=layer, + input_size=input_size, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + output_size=output_size, + params_dtype=params_dtype, + weight_loader=weight_loader, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ): + """ + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the + layer input. See LinearMethodBase for param details + + """ + scheme = layer.scheme + if scheme is None: + raise ValueError("A scheme must be defined for each layer") + return scheme.apply_weights(layer, x, bias=bias) diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py b/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py new file mode 100644 index 00000000000..7f334daf711 --- /dev/null +++ b/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .compressed_tensors_w8a8 import CompressedTensorsW8A8 +from .compressed_tensors_w8a8_dynamic import CompressedTensorsW8A8Dynamic + +__all__ = ["CompressedTensorsW8A8", "CompressedTensorsW8A8Dynamic"] \ No newline at end of file diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py new file mode 100644 index 00000000000..e9c64509b0e --- /dev/null +++ b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py @@ -0,0 +1,141 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional + +import torch +import torch_npu +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \ + CompressedTensorsScheme +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) + +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz + +logger = init_logger(__name__) + + +def quant_per_tensor(in_tensor: torch.Tensor, + input_scale: torch.Tensor, + input_offset: torch.Tensor, + function=False): + return torch_npu.npu_quantize(in_tensor, input_scale, input_offset, + torch.qint8, -1, function) + + +class CompressedTensorsW8A8(CompressedTensorsScheme): + + def __init__(self) -> None: + # aclnn quant matmul requires to transpose matrix B, set to true by default. + self.transpose_weight = not is_310p() + + @classmethod + def get_min_capability(cls) -> int: + raise NotImplementedError( + "Ascend hardware dose not support \"get_min_capability\" feature.") + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + self.output_partition_sizes = output_partition_sizes + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + # WEIGHT + weight = ModelWeightParameter( + data=torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.int8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((output_size_per_partition, 1), + dtype=params_dtype), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + input_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=params_dtype), + weight_loader=weight_loader, + ) + input_scale[:] = torch.finfo(params_dtype).min + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + if x.dtype != torch.int8: + x = quant_per_tensor( + x, + layer.aclnn_input_scale_reciprocal, + None, + ) + + if is_310p(): + # On 300I Duo platform, we need transpose again if + # using nz. This transpose can be skipped in torchair. + output = torch_npu.npu_quant_matmul( + x, + layer.weight.data.transpose(1, 0), + layer.deq_scale, + bias=bias, + output_dtype=layer.params_dtype, + ) + else: + output = torch_npu.npu_quant_matmul( + x, + layer.weight, + layer.deq_scale, + bias=bias, + output_dtype=layer.params_dtype, + ) + return output + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.input_scale = torch.nn.Parameter(layer.input_scale.max(), + requires_grad=False) + expanding_factor = layer.weight.data.shape[1] + layer.aclnn_input_scale = torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor), + requires_grad=False) + layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor), + requires_grad=False) + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + if is_enable_nz(): + layer.weight.data = torch_npu.npu_format_cast( + layer.weight.data, ACL_FORMAT_FRACTAL_NZ) + layer.weight_scale.data = torch.flatten(layer.weight_scale.data) + deq_scale = layer.input_scale.data * layer.weight_scale.data + layer.deq_scale = torch.nn.Parameter(deq_scale, requires_grad=False) diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py new file mode 100644 index 00000000000..58dcc010cff --- /dev/null +++ b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import List, Optional + +import torch +import torch_npu +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \ + CompressedTensorsScheme +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter) + +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz + +logger = init_logger(__name__) + + +class CompressedTensorsW8A8Dynamic(CompressedTensorsScheme): + + def __init__(self) -> None: + # aclnn quant matmul requires to transpose matrix B, set to true by default. + self.transpose_weight = True + + @classmethod + def get_min_capability(cls) -> int: + raise NotImplementedError( + "Ascend hardware dose not support \"get_min_capability\" feature.") + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + # WEIGHT + weight = ModelWeightParameter( + data=torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.int8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((output_size_per_partition, 1), + dtype=params_dtype), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + if not isinstance(x, tuple): + output_dtype = x.dtype + quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x) + else: + output_dtype = layer.weight_scale.dtype + quantized_x, dynamic_scale = x + + output = torch_npu.npu_quant_matmul( + quantized_x, + layer.weight, + layer.weight_scale, + pertoken_scale=dynamic_scale, + bias=bias, + output_dtype=output_dtype, + ) + return output + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + # cast quantized weight tensors in NZ format for higher inference speed + if is_enable_nz(): + layer.weight.data = torch_npu.npu_format_cast( + layer.weight.data, ACL_FORMAT_FRACTAL_NZ) + layer.weight_scale.data = layer.weight_scale.data.flatten() diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index c0760c800ed..86eff0f9e6a 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -94,7 +94,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig": @classmethod def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: - if torch.npu.is_available(): + quant_method = hf_quant_cfg.get("quant_method", None) + if quant_method is None and torch.npu.is_available(): return ASCEND_QUANTIZATION_METHOD return None diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 381510809a8..3952f9672ed 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -41,6 +41,7 @@ VllmConfig = None ASCEND_QUANTIZATION_METHOD = "ascend" +COMPRESSED_TENSORS_METHOD = "compressed-tensors" SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"] REGISTERED_ASCEND_OPS = {} diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 58ac27a0d27..559030e4b3d 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -157,6 +157,8 @@ def __init__( # FixMe: this is a patch to fix the issue cause by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170 from vllm.model_executor.layers.linear import \ WEIGHT_LOADER_V2_SUPPORTED + WEIGHT_LOADER_V2_SUPPORTED.append( + "AscendCompressedTensorsLinearMethod") if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED: WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") From d7133b32ba9a55e4bf9c377480ab1cb097ec4e01 Mon Sep 17 00:00:00 2001 From: LHXuuu Date: Thu, 20 Nov 2025 14:50:08 +0800 Subject: [PATCH 02/19] Refactoring the quantization functionality to enable LLM Compressor to reuse ModelSlim code for quantization Signed-off-by: LHXuuu --- docs/source/user_guide/feature_guide/index.md | 1 + .../quantization-llm-compressor.md | 56 ++++++ .../quantization/llm-compressor/w8a8_int8.py | 181 ++++++++++++++++++ .../llm-compressor/w8a8_int8_dynamic.py | 83 ++++++++ pyproject.toml | 1 + requirements.txt | 1 + .../compressed_tensors/compressed_tensors.py | 18 +- .../compressed_tensors/schemes/__init__.py | 7 - .../schemes/compressed_tensors_w8a8.py | 141 -------------- .../compressed_tensors_w8a8_dynamic.py | 89 --------- vllm_ascend/quantization/quant_config.py | 33 ++-- vllm_ascend/quantization/utils.py | 27 ++- vllm_ascend/quantization/w8a8.py | 14 +- 13 files changed, 392 insertions(+), 260 deletions(-) create mode 100644 docs/source/user_guide/feature_guide/quantization-llm-compressor.md create mode 100644 examples/quantization/llm-compressor/w8a8_int8.py create mode 100644 examples/quantization/llm-compressor/w8a8_int8_dynamic.py delete mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/__init__.py delete mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py delete mode 100644 vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py diff --git a/docs/source/user_guide/feature_guide/index.md b/docs/source/user_guide/feature_guide/index.md index b0c0fd7d462..3fa4f8f995a 100644 --- a/docs/source/user_guide/feature_guide/index.md +++ b/docs/source/user_guide/feature_guide/index.md @@ -7,6 +7,7 @@ This section provides a detailed usage guide of vLLM Ascend features. :maxdepth: 1 graph_mode quantization +quantization-llm-compressor sleep_mode structured_output lora diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md new file mode 100644 index 00000000000..7fad89589d1 --- /dev/null +++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md @@ -0,0 +1,56 @@ +# Quantization Guide + +Model quantization is a technique that reduces the size and computational requirements of a model by lowering the data precision of the weights and activation values in the model, thereby saving the memory and improving the inference speed. + +## Install llm-compressor + +To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM. + +Install llm-compressor: + +```bash +pip install llmcompressor +``` + +### Generate the W8A8 weights + +```bash +cd examples/quantization/llm-compressor + +python3 w8a8_int8_dynamic.py +``` + +for more details, see the [Official Sample](https://github.com/vllm-project/llm-compressor/tree/main/examples). + +## Run the model + +Now, you can run the quantized model with vLLM Ascend. Examples for online and offline inference are provided as follows: + +### Offline inference + +```python +import torch + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) + +llm = LLM(model="{quantized_model_save_path}", + max_model_len=2048, + trust_remote_code=True) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +### Online inference + +Start the quantized model using vLLM Ascend; no modifications to the startup command are required. + diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py new file mode 100644 index 00000000000..80899ba7220 --- /dev/null +++ b/examples/quantization/llm-compressor/w8a8_int8.py @@ -0,0 +1,181 @@ +import os +import torch + +from datasets import load_dataset +from transformers import AutoModelForCausalLM, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, \ + AutoTokenizer, AutoProcessor, AutoConfig, AutoImageProcessor + +from llmcompressor import oneshot +from llmcompressor.modifiers.awq import AWQModifier +from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy +from qwen_vl_utils import process_vision_info + +W8A8_W_cha_A_ten_static_symmetric = { + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=8, + type=QuantizationType.INT, + strategy=QuantizationStrategy.CHANNEL, + symmetric=True, + dynamic=False + ), + input_activations=QuantizationArgs( + num_bits=8, + type=QuantizationType.INT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False + ), + ), +} + +# supported modifiers +MODIFIER_DICT = { + "PTQ": QuantizationModifier, + "AWQ": AWQModifier, + "GPTQ": GPTQModifier, +} + +# supported schemes +SCHEMES_DICT = { + "W8A8_W_cha_A_ten_static_symmetric": W8A8_W_cha_A_ten_static_symmetric, +} + +MODEL_DICT = { + "qwen3": AutoModelForCausalLM, +} + +TOKENIZER_DICT = { + "qwen3": AutoTokenizer, +} + + +def load_environment_variables(): + env_vars = { + 'model_path': os.getenv('MODEL_PATH'), + 'export_path': os.getenv('EXPORT_PATH'), + 'modifier': os.getenv('MODIFIER'), + 'schemes': os.getenv('SCHEMES'), + 'calib_prompt_path': os.getenv('CALIB_PROMPT_PATH') + } + + # verify export model path + if env_vars['export_path'] is None: + env_vars['export_path'] = env_vars['model_path'].rstrip("/") + "-" + env_vars['modifier'] + if env_vars['schemes'] is not None: + env_vars['export_path'] += "-" + env_vars['schemes'] + os.makedirs(env_vars['export_path'], exist_ok=True) + + return env_vars + + +def load_calibration_text_dataset(calib_prompt_path, tokenizer): + # Load dataset + for f in os.listdir(calib_prompt_path): + print(f) + if any(f.lower().endswith('.jsonl') for f in os.listdir(calib_prompt_path)): + ds = load_dataset('json', data_dir=calib_prompt_path, split='validation') + elif any(f.lower().endswith('.parquet') for f in os.listdir(calib_prompt_path)): + ds = load_dataset("parquet", data_dir=calib_prompt_path, split="train[:512]") + else: + raise ValueError("Unsupported calibration file format: {}".format( + calib_prompt_path.split('.')[-1])) + + # Preprocess dataset + def preprocess(example): + if tokenizer.chat_template is not None: + return {"text": tokenizer.apply_chat_template( + example["messages"], tokenize=False)} + else: + return {"text": example["messages"]} + + # Tokenize inputs + def tokenize(sample): + return tokenizer( + sample["text"], + add_special_tokens=False, + ) + + ds = ds.map(preprocess) + ds = ds.map(tokenize, remove_columns=ds.column_names) + return ds + + +# Define a oneshot data collator for multimodal inputs. +def data_collator(batch): + assert len(batch) == 1 + return { + key: torch.tensor(value, dtype=torch.bfloat16 if key == "pixel_values" else torch.long) + for key, value in batch[0].items() + } + + +def quantize_model(model, config, env_vars, is_vl_model, dataset_dict=None): + # since the MoE gate layers are sensitive to quantization, we add them to the ignore + # list so they remain at full precision + ignore = ["lm_head", "re:.*mlp.down_proj"] + + # define a llmcompressor recipe + recipe = [ + MODIFIER_DICT[env_vars['modifier']]( + config_groups=SCHEMES_DICT[env_vars['schemes']], + ignore=ignore, + ), + ] + + if env_vars['modifier'] == 'PTQ': + oneshot( + model=model, + recipe=recipe, + trust_remote_code_model=True, + ) + elif is_vl_model: + # quantize the model + oneshot( + model=model, + dataset=dataset_dict, + recipe=recipe, + data_collator=data_collator, + trust_remote_code_model=True, + ) + else: + # quantize the model + oneshot( + model=model, + dataset=dataset_dict, + recipe=recipe, + trust_remote_code_model=True, + ) + + +def save_quantized_model(model, tokenizer, save_path, save_compressed=False): + model.save_pretrained(save_path, save_compressed=save_compressed) + tokenizer.save_pretrained(save_path) + + +if __name__ == '__main__': + # get environment variables + env_vars = load_environment_variables() + + # support model type list + config = AutoConfig.from_pretrained(env_vars['model_path'], trust_remote_code=True) + model_type = config.model_type + + model = MODEL_DICT[model_type].from_pretrained( + env_vars['model_path'], torch_dtype="auto", trust_remote_code=True + ) + tokenizer = TOKENIZER_DICT[model_type].from_pretrained(env_vars['model_path'], trust_remote_code=True) + + # Load the calibration dataset + if env_vars["calib_prompt_path"] is None: + env_vars["calib_prompt_path"] = "dataset/ultrachat_200k" + + ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer) + + # Quantize the model + quantize_model(model, config, env_vars, is_vl_model, ds) + + # save the quantized model + save_quantized_model(model, tokenizer, env_vars['export_path'], True) \ No newline at end of file diff --git a/examples/quantization/llm-compressor/w8a8_int8_dynamic.py b/examples/quantization/llm-compressor/w8a8_int8_dynamic.py new file mode 100644 index 00000000000..1cc9d21c663 --- /dev/null +++ b/examples/quantization/llm-compressor/w8a8_int8_dynamic.py @@ -0,0 +1,83 @@ +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.utils import dispatch_for_generation + +# Select model and load it. +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure algorithms. In this case, we: +# * apply SmoothQuant to make the activations easier to quantize +# * quantize the weights to int8 with GPTQ (static per channel) +# * quantize the activations to int8 (dynamic per token) +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), +] + +# Apply algorithms and save to output_dir +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Confirm generations of the quantized model look sane. +print("\n\n") +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("npu") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7f90b1edb4e..d5d939f6870 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ requires = [ "quart", "numba", "opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm + "compressed_tensors" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 936de5f6b31..124be0bee6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ torchvision wheel pandas-stubs opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm +compressed_tensors # requirements for disaggregated prefill msgpack diff --git a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py index 7938d910da3..cc279cf8b74 100644 --- a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py @@ -16,12 +16,12 @@ find_matched_target, is_activation_quantization_format, should_ignore_layer) +from vllm_ascend.quantization.quant_config import (AscendLinearMethod, + AscendQuantConfig) +from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod +from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD -from .schemes.compressed_tensors_w8a8 import CompressedTensorsW8A8 -from .schemes.compressed_tensors_w8a8_dynamic import \ - CompressedTensorsW8A8Dynamic - if TYPE_CHECKING: from vllm.model_executor.models.utils import WeightsMapper @@ -137,6 +137,7 @@ def get_quant_method( prefix: str, ) -> Optional["QuantizeMethodBase"]: if isinstance(layer, LinearBase): + layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD # collect schemes quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) @@ -144,7 +145,10 @@ def get_quant_method( quant_method: LinearMethodBase = UnquantizedLinearMethod() if quant_scheme is not None: layer.scheme = quant_scheme - quant_method = AscendCompressedTensorsLinearMethod(self) + ascend_quant_config = AscendQuantConfig( + self.quant_description) + quant_method = AscendLinearMethod(ascend_quant_config, + prefix, None, layer) return quant_method return None @@ -206,10 +210,10 @@ def _get_scheme_from_parts( act_quant_format = is_activation_quantization_format(self.quant_format) if act_quant_format and input_quant is not None: if self._is_static_tensor_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8() + return AscendW8A8LinearMethod() if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Dynamic() + return AscendW8A8DynamicLinearMethod() raise NotImplementedError( "No compressed-tensors compatible scheme was found.") diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py b/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py deleted file mode 100644 index 7f334daf711..00000000000 --- a/vllm_ascend/quantization/compressed_tensors/schemes/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from .compressed_tensors_w8a8 import CompressedTensorsW8A8 -from .compressed_tensors_w8a8_dynamic import CompressedTensorsW8A8Dynamic - -__all__ = ["CompressedTensorsW8A8", "CompressedTensorsW8A8Dynamic"] \ No newline at end of file diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py deleted file mode 100644 index e9c64509b0e..00000000000 --- a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py +++ /dev/null @@ -1,141 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import List, Optional - -import torch -import torch_npu -from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \ - CompressedTensorsScheme -from vllm.model_executor.parameter import (ChannelQuantScaleParameter, - ModelWeightParameter, - PerTensorScaleParameter) - -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz - -logger = init_logger(__name__) - - -def quant_per_tensor(in_tensor: torch.Tensor, - input_scale: torch.Tensor, - input_offset: torch.Tensor, - function=False): - return torch_npu.npu_quantize(in_tensor, input_scale, input_offset, - torch.qint8, -1, function) - - -class CompressedTensorsW8A8(CompressedTensorsScheme): - - def __init__(self) -> None: - # aclnn quant matmul requires to transpose matrix B, set to true by default. - self.transpose_weight = not is_310p() - - @classmethod - def get_min_capability(cls) -> int: - raise NotImplementedError( - "Ascend hardware dose not support \"get_min_capability\" feature.") - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - self.output_partition_sizes = output_partition_sizes - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - - # WEIGHT - weight = ModelWeightParameter( - data=torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.int8), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight", weight) - - # WEIGHT SCALE - weight_scale = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), - dtype=params_dtype), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_scale", weight_scale) - - # INPUT SCALE - input_scale = PerTensorScaleParameter( - data=torch.empty(len(output_partition_sizes), dtype=params_dtype), - weight_loader=weight_loader, - ) - input_scale[:] = torch.finfo(params_dtype).min - layer.register_parameter("input_scale", input_scale) - - def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - if x.dtype != torch.int8: - x = quant_per_tensor( - x, - layer.aclnn_input_scale_reciprocal, - None, - ) - - if is_310p(): - # On 300I Duo platform, we need transpose again if - # using nz. This transpose can be skipped in torchair. - output = torch_npu.npu_quant_matmul( - x, - layer.weight.data.transpose(1, 0), - layer.deq_scale, - bias=bias, - output_dtype=layer.params_dtype, - ) - else: - output = torch_npu.npu_quant_matmul( - x, - layer.weight, - layer.deq_scale, - bias=bias, - output_dtype=layer.params_dtype, - ) - return output - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.input_scale = torch.nn.Parameter(layer.input_scale.max(), - requires_grad=False) - expanding_factor = layer.weight.data.shape[1] - layer.aclnn_input_scale = torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor), - requires_grad=False) - layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor), - requires_grad=False) - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - if is_enable_nz(): - layer.weight.data = torch_npu.npu_format_cast( - layer.weight.data, ACL_FORMAT_FRACTAL_NZ) - layer.weight_scale.data = torch.flatten(layer.weight_scale.data) - deq_scale = layer.input_scale.data * layer.weight_scale.data - layer.deq_scale = torch.nn.Parameter(deq_scale, requires_grad=False) diff --git a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py b/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py deleted file mode 100644 index 58dcc010cff..00000000000 --- a/vllm_ascend/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamic.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional - -import torch -import torch_npu -from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \ - CompressedTensorsScheme -from vllm.model_executor.parameter import (ChannelQuantScaleParameter, - ModelWeightParameter) - -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz - -logger = init_logger(__name__) - - -class CompressedTensorsW8A8Dynamic(CompressedTensorsScheme): - - def __init__(self) -> None: - # aclnn quant matmul requires to transpose matrix B, set to true by default. - self.transpose_weight = True - - @classmethod - def get_min_capability(cls) -> int: - raise NotImplementedError( - "Ascend hardware dose not support \"get_min_capability\" feature.") - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - - # WEIGHT - weight = ModelWeightParameter( - data=torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.int8), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight", weight) - - # WEIGHT SCALE - weight_scale = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), - dtype=params_dtype), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_scale", weight_scale) - - def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - if not isinstance(x, tuple): - output_dtype = x.dtype - quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - else: - output_dtype = layer.weight_scale.dtype - quantized_x, dynamic_scale = x - - output = torch_npu.npu_quant_matmul( - quantized_x, - layer.weight, - layer.weight_scale, - pertoken_scale=dynamic_scale, - bias=bias, - output_dtype=output_dtype, - ) - return output - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - # cast quantized weight tensors in NZ format for higher inference speed - if is_enable_nz(): - layer.weight.data = torch_npu.npu_format_cast( - layer.weight.data, ACL_FORMAT_FRACTAL_NZ) - layer.weight_scale.data = layer.weight_scale.data.flatten() diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index 86eff0f9e6a..383488f7f94 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -114,7 +114,7 @@ def get_quant_method(self, layer: torch.nn.Module, self.packed_modules_mapping): return AscendUnquantizedLinearMethod() return AscendLinearMethod(self, prefix, - self.packed_modules_mapping) + self.packed_modules_mapping, layer) elif isinstance(layer, Attention) and \ 'fa_quant_type' in self.quant_description.keys() and \ self.quant_description['fa_quant_type'] is not None: @@ -127,13 +127,13 @@ def get_quant_method(self, layer: torch.nn.Module, self.packed_modules_mapping): return AscendUnquantizedFusedMoEMethod(layer.moe_config) return AscendFusedMoEMethod(self, prefix, - self.packed_modules_mapping) + self.packed_modules_mapping, layer) elif isinstance(layer, VocabParallelEmbedding): if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping): return UnquantizedEmbeddingMethod() return AscendEmbeddingMethod(self, prefix, - self.packed_modules_mapping) + self.packed_modules_mapping, layer) return None def is_layer_skipped_ascend( @@ -259,10 +259,13 @@ class AscendLinearMethod(LinearMethodBase): """ def __init__(self, quant_config: AscendQuantConfig, prefix: str, - packed_modules_mapping: Dict[str, Any]) -> None: + packed_modules_mapping: Dict[str, Any], + layer: torch.nn.Module) -> None: self.quant_method = get_quant_method(quant_config.quant_description, - prefix, "linear", - packed_modules_mapping) + prefix, + "linear", + packed_modules_mapping, + layer=layer) def create_weights( self, @@ -401,10 +404,13 @@ class AscendFusedMoEMethod(FusedMoEMethodBase): """ def __init__(self, quant_config: AscendQuantConfig, prefix: str, - packed_modules_mapping: Dict[str, Any]): + packed_modules_mapping: Dict[str, Any], + layer: torch.nn.Module): self.quant_method = get_quant_method(quant_config.quant_description, - prefix, "moe", - packed_modules_mapping) + prefix, + "moe", + packed_modules_mapping, + layer=layer) def create_weights( self, @@ -484,7 +490,10 @@ class AscendEmbeddingMethod(AscendLinearMethod): """ def __init__(self, quant_config: AscendQuantConfig, prefix: str, - packed_modules_mapping: Dict[str, Any]) -> None: + packed_modules_mapping: Dict[str, Any], + layer: torch.nn.Module) -> None: self.quant_method = get_quant_method(quant_config.quant_description, - prefix, "linear", - packed_modules_mapping) + prefix, + "linear", + packed_modules_mapping, + layer=layer) diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py index 6d914c0dade..33c979e236e 100644 --- a/vllm_ascend/quantization/utils.py +++ b/vllm_ascend/quantization/utils.py @@ -1,7 +1,10 @@ from typing import Any, Dict, Optional, Type +import torch from vllm.logger import logger +from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD + from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod) @@ -60,8 +63,28 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str, def get_quant_method(quant_description: Dict[str, Any], prefix: str, layer_type: str, - packed_modules_mapping: Optional[Dict[str, Any]] = None): - logger.info_once("Using the vLLM Ascend Quantization now!") + packed_modules_mapping: Optional[Dict[str, Any]], + layer: torch.nn.Module = None): + if quant_description.get("quant_method") == COMPRESSED_TENSORS_METHOD: + return get_quant_method_llmcompressor(layer) + + return get_quant_method_modelslim(quant_description, prefix, layer_type, + packed_modules_mapping) + + +def get_quant_method_llmcompressor(layer: torch.nn.Module): + logger.info_once("Using the vLLM Ascend llmcompressor Quantization now!") + if layer.scheme is None: + raise ValueError("A scheme must be defined for each layer") + return layer.scheme + + +def get_quant_method_modelslim( + quant_description: Dict[str, Any], + prefix: str, + layer_type: str, + packed_modules_mapping: Optional[Dict[str, Any]] = None): + logger.info_once("Using the vLLM Ascend modelslim Quantization now!") if packed_modules_mapping is None: packed_modules_mapping = dict() # Attention diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py index dcd692acfb6..1566d631e9c 100644 --- a/vllm_ascend/quantization/w8a8.py +++ b/vllm_ascend/quantization/w8a8.py @@ -25,8 +25,9 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.ops.fused_moe.experts_selector import select_experts -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz - +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, + COMPRESSED_TENSORS_METHOD, is_310p, + is_enable_nz) def quant_per_tensor(in_tensor: torch.Tensor, input_scale: torch.Tensor, @@ -147,6 +148,10 @@ def apply( ) quant_bias = layer.quant_bias if tp_rank == 0 else None + if getattr(layer, "ascend_quant_method", + "") == COMPRESSED_TENSORS_METHOD: + quant_bias = bias + if is_310p(): # On 300I Duo platform, we need transpose again if # using nz. This transpose can be skipped in torchair. @@ -185,6 +190,11 @@ def process_weights_after_loading(self, layer): layer.weight.data, ACL_FORMAT_FRACTAL_NZ) layer.weight_scale.data = torch.flatten(layer.weight_scale.data) layer.weight_offset.data = torch.flatten(layer.weight_offset.data) + if getattr(layer, "ascend_quant_method", + "") == COMPRESSED_TENSORS_METHOD: + deq_scale = layer.input_scale.data * layer.weight_scale.data + layer.deq_scale = torch.nn.Parameter(deq_scale, + requires_grad=False) class AscendW8A8FusedMoEMethod: From 84750597acaa011b72b12ab21bc12f431ab5712d Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Fri, 21 Nov 2025 20:41:19 +0800 Subject: [PATCH 03/19] CI problems Signed-off-by: chenxi-hh --- .github/workflows/_e2e_test.yaml | 1 + .../quantization-llm-compressor.md | 1 - .../quantization/llm-compressor/w8a8_int8.py | 45 +++++--------- mypy.ini | 9 +++ requirements.txt | 2 +- tests/e2e/multicard/test_quantization.py | 40 +++++++++++++ tests/ut/quantization/test_quant_config.py | 4 +- .../compressed_tensors/compressed_tensors.py | 60 ++----------------- vllm_ascend/quantization/quant_config.py | 23 ++++--- vllm_ascend/quantization/utils.py | 2 +- vllm_ascend/quantization/w8a8.py | 1 + vllm_ascend/worker/worker_v1.py | 2 - 12 files changed, 88 insertions(+), 102 deletions(-) create mode 100644 tests/e2e/multicard/test_quantization.py diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index be5b43e6373..b8dd9871164 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -203,6 +203,7 @@ jobs: pytest -sv tests/e2e/multicard/test_pipeline_parallel.py pytest -sv tests/e2e/multicard/test_prefix_caching.py pytest -sv tests/e2e/multicard/test_qwen3_moe.py + pytest -sv tests/e2e/multicard/test_quantization.py e2e-4-cards: name: multicard-4 diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md index 7fad89589d1..c523ac4ff72 100644 --- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md +++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md @@ -53,4 +53,3 @@ for output in outputs: ### Online inference Start the quantized model using vLLM Ascend; no modifications to the startup command are required. - diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py index 80899ba7220..f5e812c12df 100644 --- a/examples/quantization/llm-compressor/w8a8_int8.py +++ b/examples/quantization/llm-compressor/w8a8_int8.py @@ -9,7 +9,6 @@ from llmcompressor.modifiers.awq import AWQModifier from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy -from qwen_vl_utils import process_vision_info W8A8_W_cha_A_ten_static_symmetric = { "group_0": QuantizationScheme( @@ -54,11 +53,11 @@ def load_environment_variables(): env_vars = { - 'model_path': os.getenv('MODEL_PATH'), - 'export_path': os.getenv('EXPORT_PATH'), - 'modifier': os.getenv('MODIFIER'), - 'schemes': os.getenv('SCHEMES'), - 'calib_prompt_path': os.getenv('CALIB_PROMPT_PATH') + 'model_path': "Qwen3-32B", + 'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric", + 'modifier': "GPTQ", + 'schemes': "W8A8_W_cha_A_ten_static_symmetric", + 'calib_prompt_path': "dataset/ultrachat_200k" } # verify export model path @@ -112,7 +111,7 @@ def data_collator(batch): } -def quantize_model(model, config, env_vars, is_vl_model, dataset_dict=None): +def quantize_model(model, env_vars, dataset_dict=None): # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision ignore = ["lm_head", "re:.*mlp.down_proj"] @@ -125,29 +124,13 @@ def quantize_model(model, config, env_vars, is_vl_model, dataset_dict=None): ), ] - if env_vars['modifier'] == 'PTQ': - oneshot( - model=model, - recipe=recipe, - trust_remote_code_model=True, - ) - elif is_vl_model: - # quantize the model - oneshot( - model=model, - dataset=dataset_dict, - recipe=recipe, - data_collator=data_collator, - trust_remote_code_model=True, - ) - else: - # quantize the model - oneshot( - model=model, - dataset=dataset_dict, - recipe=recipe, - trust_remote_code_model=True, - ) + # quantize the model + oneshot( + model=model, + dataset=dataset_dict, + recipe=recipe, + trust_remote_code_model=True, + ) def save_quantized_model(model, tokenizer, save_path, save_compressed=False): @@ -175,7 +158,7 @@ def save_quantized_model(model, tokenizer, save_path, save_compressed=False): ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer) # Quantize the model - quantize_model(model, config, env_vars, is_vl_model, ds) + quantize_model(model, env_vars, ds) # save the quantized model save_quantized_model(model, tokenizer, env_vars['export_path'], True) \ No newline at end of file diff --git a/mypy.ini b/mypy.ini index 6fe8e6c2986..cdd99e92e0d 100644 --- a/mypy.ini +++ b/mypy.ini @@ -13,4 +13,13 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-lm_eval.*] +ignore_missing_imports = True + +[mypy-compressed_tensors.*] +ignore_missing_imports = True + +[mypy-datasets.*] +ignore_missing_imports = True + +[mypy-llmcompressor.*] ignore_missing_imports = True \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 124be0bee6c..566b6f89f3e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ torchvision wheel pandas-stubs opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm -compressed_tensors +compressed_tensors>=0.11.0 # requirements for disaggregated prefill msgpack diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py new file mode 100644 index 00000000000..f14fd15b3cc --- /dev/null +++ b/tests/e2e/multicard/test_quantization.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/e2e/multicard/test_quantization.py`. +""" +from modelscope import snapshot_download # type: ignore + +from tests.e2e.conftest import VllmRunner + + +def test_models_distributed_quantized_W8A8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"), + tensor_parallel_size=4, + max_model_len=4096, + gpu_memory_utilization=0.8, + distributed_executor_backend="mp", + enforce_eager=True) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + del vllm_model diff --git a/tests/ut/quantization/test_quant_config.py b/tests/ut/quantization/test_quant_config.py index 4622692dd00..b667767ba79 100644 --- a/tests/ut/quantization/test_quant_config.py +++ b/tests/ut/quantization/test_quant_config.py @@ -65,7 +65,7 @@ def test_override_quantization_method(self, mock_is_available): # Test when NPU is available mock_is_available.return_value = True result = AscendQuantConfig.override_quantization_method(None, None) - self.assertEqual(result, ASCEND_QUANTIZATION_METHOD) + self.assertIsNone(result) # Test when NPU is not available mock_is_available.return_value = False @@ -93,7 +93,7 @@ def test_get_quant_method_for_linear(self): self.assertIs(method, mock_ascend_linear.return_value) mock_ascend_linear.assert_called_once_with( self.ascend_config, ".attn", - self.ascend_config.packed_modules_mapping) + self.ascend_config.packed_modules_mapping, linear_layer) def test_get_quant_method_for_attention(self): attention_layer = MagicMock(spec=Attention) diff --git a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py index cc279cf8b74..f95ff7f0215 100644 --- a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py @@ -145,10 +145,10 @@ def get_quant_method( quant_method: LinearMethodBase = UnquantizedLinearMethod() if quant_scheme is not None: layer.scheme = quant_scheme - ascend_quant_config = AscendQuantConfig( - self.quant_description) - quant_method = AscendLinearMethod(ascend_quant_config, - prefix, None, layer) + ascend_quant_config = AscendQuantConfig(self.quant_description + or {}) + quant_method = AscendLinearMethod(ascend_quant_config, prefix, + None, layer) return quant_method return None @@ -250,55 +250,3 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): self.target_scheme_map = hf_to_vllm_mapper.apply_dict( self.target_scheme_map) self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) - - -class AscendCompressedTensorsLinearMethod(LinearMethodBase): - - def __init__(self, quantization_config: AscendCompressedTensorsConfig): - self.quantization_config = quantization_config - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.scheme.process_weights_after_loading(layer) - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - """ - Use the CompressedTensorsScheme associated with each layer to create - the necessary parameters for the layer. See LinearMethodBase for param - details - """ - weight_loader = extra_weight_attrs.get("weight_loader") - layer.scheme.create_weights( - layer=layer, - input_size=input_size, - input_size_per_partition=input_size_per_partition, - output_partition_sizes=output_partition_sizes, - output_size=output_size, - params_dtype=params_dtype, - weight_loader=weight_loader, - ) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ): - """ - Use the output of create_weights and the CompressedTensorsScheme - associated with the layer to apply the forward pass with the - layer input. See LinearMethodBase for param details - - """ - scheme = layer.scheme - if scheme is None: - raise ValueError("A scheme must be defined for each layer") - return scheme.apply_weights(layer, x, bias=bias) diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index 383488f7f94..72c04e50b70 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -94,9 +94,10 @@ def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig": @classmethod def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: - quant_method = hf_quant_cfg.get("quant_method", None) - if quant_method is None and torch.npu.is_available(): - return ASCEND_QUANTIZATION_METHOD + if hf_quant_cfg is not None: + quant_method = hf_quant_cfg.get("quant_method", None) + if quant_method is None and torch.npu.is_available(): + return ASCEND_QUANTIZATION_METHOD return None def get_quant_method(self, layer: torch.nn.Module, @@ -223,6 +224,8 @@ def get_scaled_act_names(self) -> List[str]: ], "gate_up_proj": ["gate_proj", "up_proj"], "in_proj": ["in_proj_qkvz", "in_proj_ba"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] }, "qwen2_5_vl": { "qkv_proj": [ @@ -258,9 +261,11 @@ class AscendLinearMethod(LinearMethodBase): quant_config: The Ascend quantization config. """ - def __init__(self, quant_config: AscendQuantConfig, prefix: str, - packed_modules_mapping: Dict[str, Any], - layer: torch.nn.Module) -> None: + def __init__(self, + quant_config: AscendQuantConfig, + prefix: str, + packed_modules_mapping: Dict[str, Any] | None, + layer: torch.nn.Module = None) -> None: self.quant_method = get_quant_method(quant_config.quant_description, prefix, "linear", @@ -403,9 +408,11 @@ class AscendFusedMoEMethod(FusedMoEMethodBase): quant_config: The Ascend quantization config. """ - def __init__(self, quant_config: AscendQuantConfig, prefix: str, + def __init__(self, + quant_config: AscendQuantConfig, + prefix: str, packed_modules_mapping: Dict[str, Any], - layer: torch.nn.Module): + layer: torch.nn.Module = None): self.quant_method = get_quant_method(quant_config.quant_description, prefix, "moe", diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py index 33c979e236e..eaaaee86702 100644 --- a/vllm_ascend/quantization/utils.py +++ b/vllm_ascend/quantization/utils.py @@ -63,7 +63,7 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str, def get_quant_method(quant_description: Dict[str, Any], prefix: str, layer_type: str, - packed_modules_mapping: Optional[Dict[str, Any]], + packed_modules_mapping: Optional[Dict[str, Any]] = None, layer: torch.nn.Module = None): if quant_description.get("quant_method") == COMPRESSED_TENSORS_METHOD: return get_quant_method_llmcompressor(layer) diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py index 1566d631e9c..96957597b61 100644 --- a/vllm_ascend/quantization/w8a8.py +++ b/vllm_ascend/quantization/w8a8.py @@ -29,6 +29,7 @@ COMPRESSED_TENSORS_METHOD, is_310p, is_enable_nz) + def quant_per_tensor(in_tensor: torch.Tensor, input_scale: torch.Tensor, input_offset: torch.Tensor, diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 559030e4b3d..58ac27a0d27 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -157,8 +157,6 @@ def __init__( # FixMe: this is a patch to fix the issue cause by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170 from vllm.model_executor.layers.linear import \ WEIGHT_LOADER_V2_SUPPORTED - WEIGHT_LOADER_V2_SUPPORTED.append( - "AscendCompressedTensorsLinearMethod") if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED: WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") From d8b4dbf00372682d9c9aae39a03f95c6f49ac4f9 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Mon, 24 Nov 2025 09:18:11 +0800 Subject: [PATCH 04/19] CI problems Signed-off-by: chenxi-hh --- vllm_ascend/platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index a4fec06aa12..7a62b0b946b 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -30,7 +30,7 @@ init_ascend_config) from vllm_ascend.torchair.utils import (check_torchair_cache_exist, delete_torchair_cache_file) -from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, +from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, enable_sp, is_310p, is_vl_model, prefill_context_parallel_enable, update_aclgraph_sizes, From bb60f2a7dc085c74e6272e8d9e0c530ffc93201b Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Mon, 24 Nov 2025 11:14:11 +0800 Subject: [PATCH 05/19] CI problems Signed-off-by: chenxi-hh --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d5d939f6870..3ee659e6b51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ requires = [ "quart", "numba", "opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm - "compressed_tensors" + "compressed_tensors>=0.11.0" ] build-backend = "setuptools.build_meta" From 9b402496a1af62ac840654b0202778a30cd220fb Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Mon, 24 Nov 2025 14:09:27 +0800 Subject: [PATCH 06/19] CI problems Signed-off-by: chenxi-hh --- tests/ut/test_platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index f7a6cbd1519..3530185d9a7 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -53,7 +53,7 @@ def test_class_variables(self): "ASCEND_RT_VISIBLE_DEVICES") self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1") self.assertEqual(NPUPlatform.supported_quantization, - [ASCEND_QUANTIZATION_METHOD]) + [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD]) def test_is_sleep_mode_available(self): self.assertTrue(self.platform.is_sleep_mode_available()) From 7c15337449b0fa0a04206ca09d6e694c9d278be3 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Mon, 24 Nov 2025 14:41:01 +0800 Subject: [PATCH 07/19] CI problems Signed-off-by: chenxi-hh --- tests/ut/test_platform.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 3530185d9a7..44ac831e6de 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -9,7 +9,8 @@ from tests.ut.base import TestBase from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is +from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, + COMPRESSED_TENSORS_METHOD, vllm_version_is) if vllm_version_is("0.11.0"): from vllm.config.compilation import CompilationLevel @@ -52,8 +53,9 @@ def test_class_variables(self): self.assertEqual(NPUPlatform.device_control_env_var, "ASCEND_RT_VISIBLE_DEVICES") self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1") - self.assertEqual(NPUPlatform.supported_quantization, - [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD]) + self.assertEqual( + NPUPlatform.supported_quantization, + [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD]) def test_is_sleep_mode_available(self): self.assertTrue(self.platform.is_sleep_mode_available()) From e388e2f829f50a4d1080fca0d7c39eaba039494f Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Mon, 24 Nov 2025 17:32:20 +0800 Subject: [PATCH 08/19] CI problems Signed-off-by: chenxi-hh --- tests/ut/test_platform.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 398ea6371d2..91d30ad9818 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -10,12 +10,7 @@ from tests.ut.base import TestBase from vllm_ascend.platform import NPUPlatform from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, - COMPRESSED_TENSORS_METHOD, vllm_version_is) - -if vllm_version_is("0.11.0"): - from vllm.config.compilation import CompilationLevel -else: - from vllm.config.compilation import CompilationMode + COMPRESSED_TENSORS_METHOD) class TestNPUPlatform(TestBase): From 7a076814460b01d9f1fb82c09bf3e116adaa348c Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Tue, 25 Nov 2025 16:19:53 +0800 Subject: [PATCH 09/19] CI problems Signed-off-by: chenxi-hh --- examples/quantization/llm-compressor/w8a8_int8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py index f5e812c12df..cfd75b916c7 100644 --- a/examples/quantization/llm-compressor/w8a8_int8.py +++ b/examples/quantization/llm-compressor/w8a8_int8.py @@ -53,11 +53,11 @@ def load_environment_variables(): env_vars = { - 'model_path': "Qwen3-32B", + 'model_path': "Qwen/Qwen3-32B", 'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric", 'modifier': "GPTQ", 'schemes': "W8A8_W_cha_A_ten_static_symmetric", - 'calib_prompt_path': "dataset/ultrachat_200k" + 'calib_prompt_path': "HuggingFaceH4/ultrachat_200k" } # verify export model path From 7c1848eed9ab566fb8e52f05939eb20b2f15ecff Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Tue, 25 Nov 2025 17:41:03 +0800 Subject: [PATCH 10/19] CI problems Signed-off-by: chenxi-hh --- examples/quantization/llm-compressor/w8a8_int8.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py index cfd75b916c7..9a6cb392f0c 100644 --- a/examples/quantization/llm-compressor/w8a8_int8.py +++ b/examples/quantization/llm-compressor/w8a8_int8.py @@ -151,10 +151,6 @@ def save_quantized_model(model, tokenizer, save_path, save_compressed=False): ) tokenizer = TOKENIZER_DICT[model_type].from_pretrained(env_vars['model_path'], trust_remote_code=True) - # Load the calibration dataset - if env_vars["calib_prompt_path"] is None: - env_vars["calib_prompt_path"] = "dataset/ultrachat_200k" - ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer) # Quantize the model From 85283b1fd536955fad55775f7f296bf164f04a44 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Tue, 25 Nov 2025 21:11:36 +0800 Subject: [PATCH 11/19] CI problems Signed-off-by: chenxi-hh --- tests/e2e/multicard/test_quantization.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py index f14fd15b3cc..aaeb62401e5 100644 --- a/tests/e2e/multicard/test_quantization.py +++ b/tests/e2e/multicard/test_quantization.py @@ -31,10 +31,8 @@ def test_models_distributed_quantized_W8A8(): ] max_tokens = 5 with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"), - tensor_parallel_size=4, + tensor_parallel_size=2, max_model_len=4096, gpu_memory_utilization=0.8, - distributed_executor_backend="mp", - enforce_eager=True) as vllm_model: + enforce_eager=False) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model From 24f6596d92e27e9f3740b2f53009aa4248e45ecc Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Tue, 25 Nov 2025 21:12:47 +0800 Subject: [PATCH 12/19] CI problems Signed-off-by: chenxi-hh --- .github/workflows/_e2e_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index ea56813c0a9..272116aa3d0 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -182,6 +182,7 @@ jobs: VLLM_USE_MODELSCOPE: True if: ${{ inputs.type == 'full' }} run: | + pytest -sv tests/e2e/multicard/test_quantization.py pytest -sv tests/e2e/multicard/test_aclgraph_capture_replay.py pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py pytest -sv tests/e2e/multicard/test_full_graph_mode.py @@ -207,7 +208,6 @@ jobs: pytest -sv tests/e2e/multicard/test_pipeline_parallel.py pytest -sv tests/e2e/multicard/test_prefix_caching.py pytest -sv tests/e2e/multicard/test_qwen3_moe.py - pytest -sv tests/e2e/multicard/test_quantization.py e2e-4-cards: name: multicard-4 From 8c59b6c85929708adf99858e3aa6b5e4fbd4bbd8 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Wed, 26 Nov 2025 14:32:12 +0800 Subject: [PATCH 13/19] CI problems Signed-off-by: chenxi-hh --- .../feature_guide/quantization-llm-compressor.md | 12 +++++++++++- tests/e2e/multicard/test_quantization.py | 10 +++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md index c523ac4ff72..df6f489035a 100644 --- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md +++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md @@ -1,7 +1,17 @@ -# Quantization Guide +# llm-compressor Quantization Guide Model quantization is a technique that reduces the size and computational requirements of a model by lowering the data precision of the weights and activation values in the model, thereby saving the memory and improving the inference speed. +## Supported llm-compressor Quantization Types + +Support CompressedTensorsW8A8 static weight. + +weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric. + +Support CompressedTensorsW8A8Dynamic weight. + +weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic. + ## Install llm-compressor To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM. diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py index aaeb62401e5..ecb2f49575f 100644 --- a/tests/e2e/multicard/test_quantization.py +++ b/tests/e2e/multicard/test_quantization.py @@ -35,4 +35,12 @@ def test_models_distributed_quantized_W8A8(): max_model_len=4096, gpu_memory_utilization=0.8, enforce_eager=False) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) + vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens) + + golden_results = [ + 'Hello, my name is the head of state and', + ] + + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") From 7c01955332aa64ea7584aaf8d7cb80d2471e6985 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Wed, 26 Nov 2025 15:14:06 +0800 Subject: [PATCH 14/19] CI problems Signed-off-by: chenxi-hh --- vllm_ascend/platform.py | 5 ++--- vllm_ascend/quantization/w8a8.py | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 9a57ed63230..5140d697f15 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -30,9 +30,8 @@ init_ascend_config) from vllm_ascend.torchair.utils import (check_torchair_cache_exist, delete_torchair_cache_file) -from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, - COMPRESSED_TENSORS_METHOD, - AscendDeviceType, +from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, + COMPRESSED_TENSORS_METHOD, AscendDeviceType, enable_sp, get_ascend_device_type, is_vl_model, prefill_context_parallel_enable, update_aclgraph_sizes, diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py index b109716e0af..8a7bbfe7263 100644 --- a/vllm_ascend/quantization/w8a8.py +++ b/vllm_ascend/quantization/w8a8.py @@ -25,9 +25,9 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.ops.fused_moe.experts_selector import select_experts -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType, - get_ascend_device_type, is_enable_nz, - COMPRESSED_TENSORS_METHOD) +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, + COMPRESSED_TENSORS_METHOD, AscendDeviceType, + get_ascend_device_type, is_enable_nz) def quant_per_tensor(in_tensor: torch.Tensor, @@ -153,7 +153,7 @@ def apply( if getattr(layer, "ascend_quant_method", "") == COMPRESSED_TENSORS_METHOD: quant_bias = bias - + if get_ascend_device_type() == AscendDeviceType._310P: # On 300I Duo platform, we need transpose again if # using nz. This transpose can be skipped in torchair. From 2f24a00758e351371948b89469e74465b03790d5 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Wed, 26 Nov 2025 15:21:18 +0800 Subject: [PATCH 15/19] CI problems Signed-off-by: chenxi-hh --- tests/e2e/multicard/test_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py index ecb2f49575f..67c57daf09e 100644 --- a/tests/e2e/multicard/test_quantization.py +++ b/tests/e2e/multicard/test_quantization.py @@ -27,7 +27,7 @@ def test_models_distributed_quantized_W8A8(): example_prompts = [ - "Hello, my name is", + "The president of the United States is", ] max_tokens = 5 with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"), @@ -38,7 +38,7 @@ def test_models_distributed_quantized_W8A8(): vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens) golden_results = [ - 'Hello, my name is the head of state and', + 'The president of the United States is the head of state and', ] for i in range(len(vllm_output)): From e7e110079359c3b7f8a2280f08e608a5f224aec8 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Wed, 26 Nov 2025 17:07:36 +0800 Subject: [PATCH 16/19] CI problems Signed-off-by: chenxi-hh --- vllm_ascend/platform.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 5140d697f15..9e8b2593109 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -30,13 +30,13 @@ init_ascend_config) from vllm_ascend.torchair.utils import (check_torchair_cache_exist, delete_torchair_cache_file) -from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, - COMPRESSED_TENSORS_METHOD, AscendDeviceType, - enable_sp, get_ascend_device_type, is_vl_model, - prefill_context_parallel_enable, - update_aclgraph_sizes, - update_cudagraph_capture_sizes, - update_default_aclgraph_sizes) + +# isort: off +from vllm_ascend.utils import ( + ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType, + enable_sp, get_ascend_device_type, is_vl_model, + prefill_context_parallel_enable, update_aclgraph_sizes, + update_cudagraph_capture_sizes, update_default_aclgraph_sizes) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig From 14354e9bba70673a5c035838396c97fde378b909 Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Thu, 27 Nov 2025 09:20:58 +0800 Subject: [PATCH 17/19] CI problems Signed-off-by: chenxi-hh --- .../user_guide/feature_guide/quantization-llm-compressor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md index df6f489035a..bd856d5f586 100644 --- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md +++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md @@ -16,7 +16,7 @@ weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dy To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM. -Install llm-compressor: +Install llm-compressor ```bash pip install llmcompressor From b62bf8c253a59746439bfd2b6f31bb2ac5799c4f Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Thu, 27 Nov 2025 21:06:11 +0800 Subject: [PATCH 18/19] CI problems Signed-off-by: chenxi-hh --- .../user_guide/feature_guide/quantization-llm-compressor.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md index bd856d5f586..b0c543f47a9 100644 --- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md +++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md @@ -4,11 +4,11 @@ Model quantization is a technique that reduces the size and computational requir ## Supported llm-compressor Quantization Types -Support CompressedTensorsW8A8 static weight. +Support CompressedTensorsW8A8 static weight: weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric. -Support CompressedTensorsW8A8Dynamic weight. +Support CompressedTensorsW8A8Dynamic weight: weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic. From 6dbc2b311b39a3eda7d584136474840b5a75facf Mon Sep 17 00:00:00 2001 From: chenxi-hh Date: Fri, 28 Nov 2025 09:07:06 +0800 Subject: [PATCH 19/19] CI problems Signed-off-by: chenxi-hh --- .../user_guide/feature_guide/quantization-llm-compressor.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md index b0c543f47a9..a97b4de2940 100644 --- a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md +++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md @@ -4,11 +4,11 @@ Model quantization is a technique that reduces the size and computational requir ## Supported llm-compressor Quantization Types -Support CompressedTensorsW8A8 static weight: +Support CompressedTensorsW8A8 static weight weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric. -Support CompressedTensorsW8A8Dynamic weight: +Support CompressedTensorsW8A8Dynamic weight weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic.