From dcea881b2254b9cc29a3f26f64dba57384d1acbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 16:56:40 +0300 Subject: [PATCH 001/175] Automatic quant_model_description.json detection support --- python/sglang/srt/configs/model_config.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 26dfbe5eb1d5..e4af64b1a116 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -16,6 +16,7 @@ import logging import math import os +from pathlib import Path from enum import Enum, IntEnum, auto from typing import Any, List, Optional, Set, Union @@ -36,7 +37,7 @@ from sglang.utils import is_in_ci logger = logging.getLogger(__name__) - +_is_npu = is_npu() class AttentionArch(IntEnum): MLA = auto() @@ -560,6 +561,16 @@ def _parse_quant_hf_config(self): quant_cfg = self._parse_modelopt_quant_config(quant_config_dict) return quant_cfg + def _find_quant_modelslim_config(self): + quant_config_file = Path(self.model_path, "quant_model_description.json") + if quant_config_file.is_file(): + with open(quant_config_file) as f: + quant_cfg = json.load(f) + else: + quant_cfg = None + + return quant_cfg + def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]: """Parse ModelOpt quantization config and return the appropriate quant_method.""" json_quant_configs = quant_config_dict["quantization"] @@ -678,6 +689,9 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. quant_cfg = self._parse_quant_hf_config() + if _is_npu: + quant_cfg = self._find_quant_modelslim_config() + self.quantization = 'modelslim' if quant_cfg is not None: quant_method = quant_cfg.get( From aa0a0aa4e6171a9de10c459246dd60162013ae02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:25:20 +0300 Subject: [PATCH 002/175] Add w4a4 support --- .../hardware_backend/npu/quantization/w4a4.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a4.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py new file mode 100644 index 000000000000..e1d7a8b8a6cb --- /dev/null +++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py @@ -0,0 +1,35 @@ +class NPU_W4A4DynamicLinearMethodImpl: + """Linear method for NPU W4A4_DYNAMIC.""" + + def __init__(self): + self.transpose_weight = True + + @staticmethod + def apply( + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + tp_rank: Optional[int] = 0, + ) -> torch.Tensor: + original_dtype = x.dtype + quant_out, dynamic_scale = torch_npu.npu_dynamic_quant( + x, dst_type=torch.quint4x2 + ) + return torch_npu.npu_quant_matmul( + quant_out, + layer.weight, + layer.weight_scale, + pertoken_scale=dynamic_scale, + bias=bias, + output_dtype=original_dtype, + ) + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32) + ) From 6c845aded27617bd937286e714d18d7c8560a5cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:32:28 +0300 Subject: [PATCH 003/175] Refactor w8a8 --- .../hardware_backend/npu/quantization/w8a8.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py new file mode 100644 index 000000000000..f9ad7f4a16ac --- /dev/null +++ b/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py @@ -0,0 +1,100 @@ +from typing import TYPE_CHECKING, List, Optional + +import torch + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase + +class NPUW8A8Int8LinearMethod(_NPULinearMethodBase): + """Linear method for NPU W8A8.""" + + def __init__(self): + self.transpose_weight = True + + @staticmethod + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + from sglang.srt.layers.linear import RowParallelLinear + + original_dtype = x.dtype + if original_dtype != torch.int8: + x = torch.ops.npu.npu_quantize( + x, + layer.aclnn_input_scale_reciprocal, + layer.aclnn_input_offset, + torch.qint8, + -1, + False, + ) + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in Attention TP>1 case) + if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0: + quant_bias = None + else: + quant_bias = layer.quant_bias + return torch.ops.npu.npu_quant_matmul( + x, + layer.weight, + layer.deq_scale, + bias=quant_bias, + output_dtype=original_dtype, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight.data = npu_format_cast(layer.weight.data) + + layer.weight_scale.data = torch.flatten(layer.weight_scale.data) + layer.weight_offset.data = torch.flatten(layer.weight_offset.data) + + expanding_factor = layer.weight.data.shape[0] + layer.aclnn_input_scale = torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_offset = torch.nn.Parameter( + layer.input_offset.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + + +class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase): + """Linear method for NPU W8A8_DYNAMIC.""" + + def __init__(self): + self.transpose_weight = True + + @staticmethod + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + original_dtype = x.dtype + quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x) + return torch.ops.npu.npu_quant_matmul( + quant_out, + layer.weight, + layer.weight_scale, + pertoken_scale=dynamic_scale, + bias=bias, + output_dtype=original_dtype, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight.data = npu_format_cast(layer.weight.data) + + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_offset.data = layer.weight_offset.data.flatten() From dee644b2946df18006dfad370a022cf5b60beb91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:34:04 +0300 Subject: [PATCH 004/175] Add import section --- .../sglang/srt/hardware_backend/npu/quantization/w4a4.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py index e1d7a8b8a6cb..4676b4655872 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py @@ -1,3 +1,10 @@ +from typing import TYPE_CHECKING, List, Optional + +import torch + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase + class NPU_W4A4DynamicLinearMethodImpl: """Linear method for NPU W4A4_DYNAMIC.""" From 35b8983a65b28a51eaca5333bade407716ef97eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:35:26 +0300 Subject: [PATCH 005/175] Create quantization utils file --- .../hardware_backend/npu/quantization/utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/utils.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/utils.py b/python/sglang/srt/hardware_backend/npu/quantization/utils.py new file mode 100644 index 000000000000..0350d85e6400 --- /dev/null +++ b/python/sglang/srt/hardware_backend/npu/quantization/utils.py @@ -0,0 +1,15 @@ +from typing import TYPE_CHECKING, List, Optional + +from sglang.srt.layers.quantization.base_config import LinearMethodBase + +if TYPE_CHECKING: + from sglang.srt.layers.quantization.base_config import QuantizationConfig + +class _NPULinearMethodBase(LinearMethodBase): + + def __init__( + self, + quant_config: Optional["QuantizationConfig"] = None, + ): + super().__init__() + self.quant_config = quant_config From 311cc288153fc00c65d0766287a63cbde5988347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:40:05 +0300 Subject: [PATCH 006/175] Create w4a16 --- .../npu/quantization/w4a16.py | 195 ++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a16.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py new file mode 100644 index 000000000000..2f3f2a4539f3 --- /dev/null +++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py @@ -0,0 +1,195 @@ +from typing import TYPE_CHECKING + +import numpy as np +import torch + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase + +class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase): + + def __init__(self) -> None: + self.group_size = 256 + self.tp_size = 1 + + def pack_to_int32(self, weight: torch.Tensor): + assert weight.dim() == 3 + if weight.dtype == torch.int32: + # pack 8 int4 to int32, we use a int32 to represent a int4 + assert ( + weight.shape[-1] % 8 == 0 + ), "the last dim of weight needs to be divided by 8" + new_weight = torch.ops.npu.npu_convert_weight_to_int4pack( + weight.flatten(0, 1) + ) + new_weight = new_weight.view(weight.shape[0], weight.shape[1], -1) + elif weight.dtype == torch.int8: + # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 + assert ( + weight.shape[-1] % 4 == 0 + ), "the last dim of weight needs to be divided by 4" + new_weight = weight.view(torch.int32).contiguous() + else: + raise ValueError(f"{weight.dtype=} is not supported !") + return new_weight + + def unpack_from_int32( + self, + value: torch.Tensor, + num_bits: int, + shape: torch.Size = None, + packed_dim=1, + ) -> torch.Tensor: + """ + Unpacks a tensor of packed int32 weights into individual int8s, maintaining the + original bit range. + + Return tensors in int8 + + :param value: tensor to unpack + :param num_bits: number of bits to unpack each data point into + :param shape: shape to unpack into, used to remove padding + :returns: unpacked int8 tensor + """ + if value.dtype is not torch.int32: + raise ValueError( + f"Expected {torch.int32} but got {value.dtype}, Aborting unpack." + ) + + if num_bits > 8: + raise ValueError("Unpacking is only supported for less than 8 bits") + + pack_factor = 32 // num_bits + + # unpack + mask = (1 << num_bits) - 1 + + if packed_dim == 1: + unpacked = torch.zeros( + (value.shape[0], value.shape[1] * pack_factor), + device=value.device, + dtype=torch.int32, + ) + for i in range(pack_factor): + unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask + + # remove padding + if shape is not None: + original_row_size = int(shape[1]) + unpacked = unpacked[:, :original_row_size] + else: + unpacked = torch.zeros( + (value.shape[0] * pack_factor, value.shape[1]), + device=value.device, + dtype=torch.int32, + ) + for i in range(pack_factor): + unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask + + # remove padding + original_row_size = int(shape[0]) + unpacked = unpacked[:original_row_size, :] + + # bits are packed in unsigned format, reformat to signed + # update the value range from unsigned to signed + offset = pow(2, num_bits) // 2 + unpacked = (unpacked - offset).to(torch.int8) + + return unpacked + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous() + w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous() + layer.w13_weight_scale = torch.nn.Parameter( + w13_weight_scale, requires_grad=False + ) + layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False) + + layer.w13_weight_offset = torch.nn.Parameter( + layer.w13_weight_offset.data.transpose(-1, -2).contiguous(), + requires_grad=False, + ) + layer.w2_weight_offset = torch.nn.Parameter( + layer.w2_weight_offset.data.transpose(-1, -2).contiguous(), + requires_grad=False, + ) + + # w = [n, k // 8] --> [k, n // 8] + # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous() + # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous() + unpacked_w13_weight = ( + self.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4) + .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1) + .transpose(1, 2) + .contiguous() + .int() + ) + unpacked_w2_weight = ( + self.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4) + .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1) + .transpose(1, 2) + .contiguous() + .int() + ) + + w13_weight = self.pack_to_int32(unpacked_w13_weight) + w2_weight = self.pack_to_int32(unpacked_w2_weight) + + layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" + ): + self.moe_runner_config = moe_runner_config + + def apply( + self, + layer, + dispatch_output: "StandardDispatchOutput", + ) -> "CombineInput": + # FIXME W4A8 only support with deepep + raise NotImplementedError( + f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep" + ) + + def apply_without_routing_weights( + self, + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ): + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[self.w13_weight], + scale=[self.w13_weight_scale], + bias=[self.w13_scale_bias], + per_token_scale=[hidden_states_scale], + group_list=group_list, + split_item=2, + group_type=0, + group_list_type=group_list_type, + output_dtype=output_dtype, + )[0] + + # act_fn: swiglu + hidden_states = torch.ops.npu.npu_swiglu(hidden_states) + hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) + + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[self.w2_weight], + scale=[self.w2_weight_scale], + bias=[self.w2_scale_bias], + per_token_scale=[swiglu_out_scale], + group_list=group_list, + split_item=2, + group_type=0, + group_list_type=group_list_type, + output_dtype=output_dtype, + )[0] + + return hidden_states From 6869ebf700f54aeb9b26ff65744b6904b0052a1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:43:02 +0300 Subject: [PATCH 007/175] Create w4a8.py --- .../hardware_backend/npu/quantization/w4a8.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py new file mode 100644 index 000000000000..3696c4d36380 --- /dev/null +++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py @@ -0,0 +1,148 @@ +from typing import TYPE_CHECKING + +import numpy as np +import torch + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase + +class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase): + + def __init__(self) -> None: + self.group_size = 256 + self.tp_size = 1 + + def process_scale(self, weight: torch.Tensor, scale, per_group_scale): + scale = scale.transpose(1, 2).contiguous() + per_group_scale = per_group_scale.transpose(1, 2).contiguous() + group_num, k, n = weight.shape + # the weight of the new version is reduced by half by pack n, so it needs to be restored + n = n * 2 + per_group_scale = per_group_scale.reshape(group_num, -1, n) + group_num, quantgroup_num, n = per_group_scale.shape + bias = None + + scale_fp32 = (scale * per_group_scale).to(torch.float16).to(torch.float32) + scale_fp32_np = scale_fp32.cpu().numpy() + scale_fp32_np.dtype = np.uint32 + sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), dtype=np.uint32) + + sscale_uint64[..., ::2] = scale_fp32_np + + sscale_uint64_buffer = np.frombuffer( + sscale_uint64.tobytes(), dtype=np.int64 + ).copy() + sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape( + group_num, quantgroup_num, n + ) + sscale_uint64_tensor = sscale_uint64_tensor.npu() + return sscale_uint64_tensor, bias + + def update_bias(self, layer, w13_bias, w2_bias): + layer.w13_scale_bias.data = ( + layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) + ) + layer.w2_scale_bias.data = ( + layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) + ) + + def pack_to_int32(self, weight: torch.Tensor): + # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 + assert ( + weight.shape[-1] % 4 == 0 + ), "the last dim of weight needs to be divided by 4" + return weight.view(torch.int32).contiguous() + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.w13_weight = torch.nn.Parameter( + layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False + ) + layer.w2_weight = torch.nn.Parameter( + layer.w2_weight.data.transpose(1, 2).contiguous(), requires_grad=False + ) + + w13_weight_scale_second = ( + layer.w13_weight_scale_second.data + if hasattr(layer, "w13_weight_scale_second") + else None + ) + w2_weight_scale_second = ( + layer.w2_weight_scale_second.data + if hasattr(layer, "w2_weight_scale_second") + else None + ) + layer.w13_weight_scale.data, w13_bias = self.process_scale( + layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second + ) + layer.w2_weight_scale.data, w2_bias = self.process_scale( + layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second + ) + if hasattr(layer, "w13_weight_scale_second"): + # scale_second is no longer used, release this part of the memory + del layer.w13_weight_scale_second + del layer.w2_weight_scale_second + del layer.w13_weight_offset_second + del layer.w2_weight_offset_second + + self.update_bias(layer, w13_bias, w2_bias) + + layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) + layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) + layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) + layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" + ): + self.moe_runner_config = moe_runner_config + + def apply( + self, + layer, + dispatch_output: "StandardDispatchOutput", + ) -> "CombineInput": + # FIXME W4A8 only support with deepep + raise NotImplementedError( + f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep" + ) + + def apply_without_routing_weights( + self, + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ): + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[self.w13_weight], + scale=[self.w13_weight_scale], + bias=[self.w13_scale_bias], + per_token_scale=[hidden_states_scale], + group_list=group_list, + split_item=2, + group_type=0, + group_list_type=group_list_type, + output_dtype=output_dtype, + )[0] + + # act_fn: swiglu + hidden_states = torch.ops.npu.npu_swiglu(hidden_states) + hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) + + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[self.w2_weight], + scale=[self.w2_weight_scale], + bias=[self.w2_scale_bias], + per_token_scale=[swiglu_out_scale], + group_list=group_list, + split_item=2, + group_type=0, + group_list_type=group_list_type, + output_dtype=output_dtype, + )[0] + + return hidden_states From c7d6dd5c521db342e9ebed6e07317d4f4c4ee53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:46:38 +0300 Subject: [PATCH 008/175] Rename w4a16.py to w4a16_moe.py --- .../hardware_backend/npu/quantization/{w4a16.py => w4a16_moe.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/sglang/srt/hardware_backend/npu/quantization/{w4a16.py => w4a16_moe.py} (100%) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py similarity index 100% rename from python/sglang/srt/hardware_backend/npu/quantization/w4a16.py rename to python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py From 7ffe0f62314626950bb3385c18434cbe51613c41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:46:48 +0300 Subject: [PATCH 009/175] Rename w4a8.py to w4a8_moe.py --- .../hardware_backend/npu/quantization/{w4a8.py => w4a8_moe.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/sglang/srt/hardware_backend/npu/quantization/{w4a8.py => w4a8_moe.py} (100%) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py similarity index 100% rename from python/sglang/srt/hardware_backend/npu/quantization/w4a8.py rename to python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py From e2d8889cbcf704e95775ac398c5622855262364d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:49:14 +0300 Subject: [PATCH 010/175] Create w8a8_moe --- .../npu/quantization/w8a8_moe.py | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py new file mode 100644 index 000000000000..789e5b516ced --- /dev/null +++ b/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py @@ -0,0 +1,215 @@ +from typing import TYPE_CHECKING + +import numpy as np +import torch + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase + +def npu_fused_experts( + hidden_states: torch.Tensor, + w13: torch.Tensor, + w13_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + **kwargs, +): + w13_offset = kwargs.get("w13_offset", None) + w2_offset = kwargs.get("w2_offset", None) + use_wna16 = kwargs.get("use_wna16", False) + + original_shape = hidden_states.shape + original_dtype = hidden_states.dtype + scale_dtype = original_dtype if original_dtype == torch.bfloat16 else torch.float32 + if len(original_shape) == 3: + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + num_tokens = hidden_states.shape[0] + num_experts = w13.shape[0] + row_idx_len = num_tokens * top_k + row_idx = ( + torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device) + .view(top_k, -1) + .permute(1, 0) + .contiguous() + ) + hidden_states, expanded_row_idx, expanded_expert_idx = ( + torch.ops.npu.npu_moe_init_routing( + hidden_states, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens + ) + ) + expert_tokens = torch.ops.npu.npu_moe_compute_expert_tokens( + expanded_expert_idx, num_experts + ) + expert_tokens = expert_tokens.to(torch.int64) + # gmm1: gate_up_proj + if not use_wna16: + hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) + scale_args13 = { + "scale": [w13_scale.to(scale_dtype)], + "per_token_scale": [pertoken_scale], + } + else: + scale_args13 = { + "antiquant_scale": [w13_scale], + "antiquant_offset": [w13_offset], + } + + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w13], + **scale_args13, + split_item=2, + group_list_type=0, + group_type=0, + group_list=expert_tokens, + output_dtype=original_dtype, + )[0] + # act_fn: swiglu + hidden_states = torch.ops.npu.npu_swiglu(hidden_states) + if not use_wna16: + hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) + + scale_args2 = { + "scale": [w2_scale.to(scale_dtype)], + "per_token_scale": [pertoken_scale], + } + else: + scale_args2 = {"antiquant_scale": [w2_scale], "antiquant_offset": [w2_offset]} + # gmm2: down_proj + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + **scale_args2, + split_item=2, + group_list_type=0, + group_type=0, + group_list=expert_tokens, + output_dtype=original_dtype, + )[0] + + final_hidden_states = torch.ops.npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + skip2=None, + bias=None, + scales=topk_weights, + expanded_src_to_dst_row=expanded_row_idx, + export_for_source_row=topk_ids, + ) + if len(original_shape) == 3: + final_hidden_states = final_hidden_states.view(original_shape) + return final_hidden_states + +class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase): + + ### TODO remove this ### + def release_weight_cache(self, weight: torch.Tensor): + # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) + origin_weight = weight.data.transpose(1, 2) + new_weight = origin_weight.contiguous() + origin_weight.untyped_storage().resize_(0) + return new_weight + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight_data = self.release_weight_cache(layer.w13_weight.data) + layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) + + weight_data = self.release_weight_cache(layer.w2_weight.data) + layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) + + layer.w13_weight_scale = torch.nn.Parameter( + layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), + requires_grad=False, + ) + layer.w2_weight_scale = torch.nn.Parameter( + layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False + ) + layer.w13_weight_offset = torch.nn.Parameter( + layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) + layer.w2_weight_offset = torch.nn.Parameter( + layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) + + layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) + layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" + ): + self.moe_runner_config = moe_runner_config + + def apply( + self, + layer, + dispatch_output: "StandardDispatchOutput", + ) -> "CombineInput": + from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput + + x = dispatch_output.hidden_states + topk_output = dispatch_output.topk_output + + topk_weights, topk_ids, _ = topk_output + topk_ids = topk_ids.to(torch.int32) + topk_weights = topk_weights.to(x.dtype) + output = npu_fused_experts( + hidden_states=x, + w13=layer.w13_weight, + w13_scale=layer.w13_weight_scale, + w2=layer.w2_weight, + w2_scale=layer.w2_weight_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=topk_ids.shape[1], + ) + return StandardCombineInput(hidden_states=output) + + def apply_without_routing_weights( + self, + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ): + # gmm1: gate_up_proj + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[layer.w13_weight], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=torch.int32, + )[0] + + # act_fn: swiglu + hidden_states, swiglu_out_scale = torch.ops.npu.npu_dequant_swiglu_quant( + x=hidden_states, + weight_scale=layer.w13_weight_scale, + activation_scale=hidden_states_scale, + bias=None, + quant_scale=None, + quant_offset=None, + group_index=group_list, + activate_left=True, + quant_mode=1, + ) + + # gmm2: down_proj + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[layer.w2_weight], + scale=[layer.w2_weight_scale.to(output_dtype)], + per_token_scale=[swiglu_out_scale], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=output_dtype, + )[0] + return hidden_states From 41d3d3f8500e1362aeff20c51bc8ab80cf3cfe2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 5 Dec 2025 17:54:12 +0300 Subject: [PATCH 011/175] Create w4a8.py --- .../hardware_backend/npu/quantization/w4a8.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py new file mode 100644 index 000000000000..7cd4dc81486a --- /dev/null +++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py @@ -0,0 +1,119 @@ +from __future__ import annotations + +import importlib +import sys +from types import MappingProxyType +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Tuple, + Union, + cast, +) + +import torch +from torch.nn.parameter import Parameter + +from sglang.srt.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading +from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig +from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo +from sglang.srt.layers.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) +from sglang.srt.layers.quantization.base_config import ( + FusedMoEMethodBase, + LinearMethodBase, + QuantizationConfig, + QuantizeMethodBase, +) +from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer +from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod +from sglang.srt.layers.quantization.w8a8_int8 import NPU_W8A8DynamicLinearMethod +from sglang.srt.utils import ( + apply_module_patch, + cpu_has_amx_support, + is_cpu, + is_cuda, + is_npu, + set_weight_attrs, + use_intel_amx_backend, +) + +if TYPE_CHECKING: + from sglang.srt.layers.moe.token_dispatcher import ( + CombineInput, + StandardDispatchOutput, + ) + +_is_cuda = is_cuda() +_is_cpu_amx_available = cpu_has_amx_support() +_is_cpu = is_cpu() +_is_npu = is_npu() + +if _is_npu: + import torch_npu + +class NPU_W4A8DynamicLinearMethod: + """Linear method for NPU W4A8_DYNAMIC.""" + + def __init__(self): + self.transpose_weight = True + try: + self.group_size = self.quantization_config.get("group_size", 256) + except AttributeError: + self.group_size = 256 + + @staticmethod + def process_scale_second(weight: torch.Tensor, scale: torch.Tensor, + per_group_scale: torch.Tensor): + k, n = weight.shape + group_num, n = per_group_scale.shape + weight_high = weight.to(torch.float32).reshape( + group_num, -1, n) * per_group_scale.reshape(group_num, 1, n) + weight_high = weight_high.reshape(k, n) + bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0) + antiquant_scale = (scale * per_group_scale).reshape(group_num, n) + return antiquant_scale.npu(), bias + + @staticmethod + def apply( + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + tp_rank: Optional[int] = 0, + ) -> torch.Tensor: + group_size = 256 + return torch_npu.npu_weight_quant_batchmatmul( + x, + layer.weight, + antiquant_scale=layer.weight_scale_second.to(x.dtype), + antiquant_group_size=group_size, + ) + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + + layer.weight_scale.data = layer.weight_scale.data.flatten().to( + torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight_scale_second.data, scale_bias = self.process_scale_second( + layer.weight.data, + layer.weight_scale.data, + layer.weight_scale_second.data.transpose(0, 1).contiguous(), + ) + param = torch.nn.Parameter(scale_bias, requires_grad=False) + layer.register_parameter("weight_scale_bias", param) + layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32)) From 6d0b035174c98d75abd2323211aa9012e2856c65 Mon Sep 17 00:00:00 2001 From: TamirBaydasov Date: Mon, 8 Dec 2025 05:20:59 +0300 Subject: [PATCH 012/175] Create msmodelslim structure, initial commit --- .../npu/quantization/modelslim.py | 241 ---------- .../quantization/msmodelslim/msmodelslim.py | 426 ++++++++++++++++++ .../msmodelslim/msmodelslim_moe.py | 57 +++ .../msmodelslim/schemes/__init__.py | 0 .../msmodelslim/schemes/msmodelslim_scheme.py | 0 .../schemes/msmodelslim_w8a8_int8.py | 0 .../schemes/msmodelslim_w8a8_int8_moe.py | 0 7 files changed, 483 insertions(+), 241 deletions(-) create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py b/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py index aae78683686c..e69de29bb2d1 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py @@ -1,241 +0,0 @@ -from __future__ import annotations - -from types import MappingProxyType -from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast - -import torch -from compressed_tensors.quantization import QuantizationStrategy - -from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( - NPUW4A8Int4DynamicMoEMethod, - NPUW4A16Int4DynamicMoEMethod, - NPUW8A8Int8DynamicMoEMethod, -) -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( - NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod, -) -from sglang.srt.layers.quantization.base_config import ( - QuantizationConfig, - QuantizeMethodBase, -) -from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import ( - CompressedTensorsConfig, -) -from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer -from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod -from sglang.srt.utils import apply_module_patch - - -# func refers to RMSNorm.__init__ -def npu_wrapper_rmsnorm_init(func): - def init(self, hidden_size: int, **extra_args) -> None: - func(self, hidden_size, **extra_args) - self.ignore_anti = True - # The Ascend w8a8_int8 quantization requires adding a bias in rmsnorm - self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False) - - return init - - -# func refers to RMSNorm.forward_oot -def npu_wrapper_rmsnorm_forward(func): - def _rmsnorm_forward_oot( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from sgl_kernel_npu.norm.add_rmsnorm_bias import add_rmsnorm_bias - - if not x.is_contiguous(): - x = x.contiguous() - if residual is not None: - out, residual_out = add_rmsnorm_bias( - x, - residual, - self.weight.data, - self.bias, - self.variance_epsilon, - ) - return out.to(x.dtype), residual_out - - out = torch.ops.npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0] - out = out + self.bias - return out.to(x.dtype) - - return _rmsnorm_forward_oot - - -class ModelSlimConfig(QuantizationConfig): - """ - Config class for ModelSlim Quantization, a NPU-specific quantization type. - """ - - def __init__(self, quant_config: Dict[str, Any] = {}): - super().__init__() - self.quant_description = quant_config - self.is_dynamic = quant_config.get("is_dynamic", False) - self.is_moe_w4_dynamic = False - ignore = cast(List[str], quant_config.get("ignore", [])) - self.ignore = ignore if ignore is not None else [] - packed_modules_mapping = quant_config.get("packed_modules_mapping", {}) - self.packed_modules_mapping = ( - packed_modules_mapping if packed_modules_mapping is not None else {} - ) - self.target_scheme_map = ( - CompressedTensorsConfig._quantization_scheme_map_from_config( - config=quant_config - ) - ) - target = "MoEGMM" if "MoEGMM" in self.target_scheme_map else "Linear" - target_scheme = self.target_scheme_map.get(target, None) - if target_scheme is None: - self.is_moe_w4_dynamic = False - else: - weight_quant = target_scheme.get("weights") - input_quant = target_scheme.get("input_activations") - self.is_moe_w4_dynamic = self.is_dynamic_token_w4(weight_quant, input_quant) - self.is_moe_input_quant = input_quant - - for name in self.quant_description.keys(): - if "norm.bias" in name: - apply_module_patch( - "sglang.srt.layers.layernorm.RMSNorm", - "__init__", - [npu_wrapper_rmsnorm_init], - ) - apply_module_patch( - "sglang.srt.layers.layernorm.RMSNorm", - "forward_npu", - [npu_wrapper_rmsnorm_forward], - ) - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.int8, torch.float16, torch.bfloat16] - - @classmethod - def get_min_capability(cls) -> int: - return 0 - - @classmethod - def get_name(self) -> str: - return "modelslim" - - @classmethod - def get_config_filenames(cls) -> List[str]: - filenames = ["quant_model_description.json"] - return filenames - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> ModelSlimConfig: - return cls(config) - - def get_quant_method( - self, - layer: torch.nn.Module, - prefix: str, - ) -> Optional[QuantizeMethodBase]: - from sglang.srt.layers.linear import LinearBase - from sglang.srt.layers.moe.fused_moe_triton import FusedMoE - - if isinstance(layer, LinearBase): - if should_ignore_layer( - prefix, - ignore=self.ignore, - fused_mapping=self.packed_modules_mapping, - ): - return UnquantizedLinearMethod() - key = "model" - if "vision_model" in prefix: - key = "vision_model" - elif "visual" in prefix: - key = "visual" - packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {}) - prefix_in_quant_config = prefix - proj_name = prefix.split(".")[-1] - if proj_name in packed_modules_mapping_subset: - prefix_in_quant_config = prefix.replace( - proj_name, packed_modules_mapping_subset[proj_name][0] - ) - self.is_dynamic = ( - self.quant_description[prefix_in_quant_config + ".weight"] - == "W8A8_DYNAMIC" - ) - if self.is_layer_skipped(prefix, packed_modules_mapping_subset): - return UnquantizedLinearMethod() - return ( - NPUW8A8Int8DynamicLinearMethod(self) - if self.is_dynamic - else NPUW8A8Int8LinearMethod(self) - ) - elif isinstance(layer, FusedMoE): - prefix_in_quant_config = prefix + ".0.down_proj.weight" - is_moe_w4a8_dynamic = ( - self.quant_description.get(prefix_in_quant_config, "STATIC") - == "W4A8_DYNAMIC" - ) - if ( - self.is_moe_w4_dynamic and self.is_moe_input_quant is not None - ) or is_moe_w4a8_dynamic: - return NPUW4A8Int4DynamicMoEMethod() - elif self.is_moe_w4_dynamic and self.is_moe_input_quant is None: - return NPUW4A16Int4DynamicMoEMethod(self) - else: - return NPUW8A8Int8DynamicMoEMethod() - return None - - def is_layer_skipped( - self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) - ): - # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped - proj_name = prefix.split(".")[-1] - if proj_name in fused_mapping: - shard_prefixes = [ - prefix.replace(proj_name, shard_proj_name) - for shard_proj_name in fused_mapping[proj_name] - ] - - is_skipped = None - for shard_prefix in shard_prefixes: - is_shard_skipped = ( - self.quant_description[shard_prefix + ".weight"] == "FLOAT" - ) - - if is_skipped is None: - is_skipped = is_shard_skipped - elif is_shard_skipped != is_skipped: - raise ValueError( - f"Detected some but not all shards of {prefix} " - "are quantized. All shards of fused layers " - "to have the same precision." - ) - else: - is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT" - - assert is_skipped is not None - return is_skipped - - def get_scaled_act_names(self) -> List[str]: - return [] - - def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool: - is_w4 = weight_quant.num_bits == 4 - weight_strategy = ( - weight_quant.strategy == QuantizationStrategy.TENSOR.value - or weight_quant.strategy == QuantizationStrategy.CHANNEL.value - or weight_quant.strategy == QuantizationStrategy.GROUP.value - ) - if input_quant is not None: - is_token = ( - weight_strategy - and input_quant.strategy == QuantizationStrategy.TOKEN.value - ) - is_dynamic = not weight_quant.dynamic and input_quant.dynamic - else: - is_token = weight_strategy - is_dynamic = not weight_quant.dynamic - - # Both symmetric and asymmetric input quantization supported. - # Only symmetric weight quantization supported. - return is_w4 and weight_quant.symmetric and is_token and is_dynamic diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py new file mode 100644 index 000000000000..0f302d3565ae --- /dev/null +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -0,0 +1,426 @@ +from __future__ import annotations + +import logging +from types import MappingProxyType +from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast + +import torch +from compressed_tensors.quantization import QuantizationStrategy +from pydantic import BaseModel + +# from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( +# NPUW4A8Int4DynamicMoEMethod, +# NPUW4A16Int4DynamicMoEMethod, +# NPUW8A8Int8DynamicMoEMethod, +# ) +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + _NPULinearMethodBase + # NPUW8A8Int8DynamicLinearMethod, + # NPUW8A8Int8LinearMethod, +) +from sglang.srt.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import ( + ModelSlimMoEMethod, +) +from sglang.srt.layers.quantization.compressed_tensors.utils import ( + find_matched_target, + is_activation_quantization_format, + should_ignore_layer +) +#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer +from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod +from sglang.srt.utils import apply_module_patch + +logger = logging.getLogger(__name__) + +# func refers to RMSNorm.__init__ +def npu_wrapper_rmsnorm_init(func): + def init(self, hidden_size: int, **extra_args) -> None: + func(self, hidden_size, **extra_args) + self.ignore_anti = True + # The Ascend w8a8_int8 quantization requires adding a bias in rmsnorm + self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False) + + return init + +# func refers to RMSNorm.forward_oot +def npu_wrapper_rmsnorm_forward(func): + def _rmsnorm_forward_oot( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from sgl_kernel_npu.norm.add_rmsnorm_bias import add_rmsnorm_bias + + if not x.is_contiguous(): + x = x.contiguous() + if residual is not None: + out, residual_out = add_rmsnorm_bias( + x, + residual, + self.weight.data, + self.bias, + self.variance_epsilon, + ) + return out.to(x.dtype), residual_out + + out = torch.ops.npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0] + out = out + self.bias + return out.to(x.dtype) + + return _rmsnorm_forward_oot + + +class ModelSlimConfig(QuantizationConfig): + """ + Config class for ModelSlim Quantization, a NPU-specific quantization type. + """ + + def __init__(self, quant_config: Dict[str, Any] = {}): + super().__init__() + self.quant_description = quant_config + # self.is_dynamic = quant_config.get("is_dynamic", False) + # self.is_moe_w4_dynamic = False + ignore = cast(List[str], quant_config.get("ignore", [])) + self.ignore = ignore if ignore is not None else [] + packed_modules_mapping = quant_config.get("packed_modules_mapping", {}) + self.packed_modules_mapping = ( + packed_modules_mapping if packed_modules_mapping is not None else {} + ) + # self.target_scheme_map = ( + # CompressedTensorsConfig._quantization_scheme_map_from_config( + # config=quant_config + # ) + # ) + # target = "MoEGMM" if "MoEGMM" in self.target_scheme_map else "Linear" + # target_scheme = self.target_scheme_map.get(target, None) + # if target_scheme is None: + # self.is_moe_w4_dynamic = False + # else: + # weight_quant = target_scheme.get("weights") + # input_quant = target_scheme.get("input_activations") + # self.is_moe_w4_dynamic = self.is_dynamic_token_w4(weight_quant, input_quant) + # self.is_moe_input_quant = input_quant + + for name in self.quant_description.keys(): + if "norm.bias" in name: + apply_module_patch( + "sglang.srt.layers.layernorm.RMSNorm", + "__init__", + [npu_wrapper_rmsnorm_init], + ) + apply_module_patch( + "sglang.srt.layers.layernorm.RMSNorm", + "forward_npu", + [npu_wrapper_rmsnorm_forward], + ) + + def get_linear_method(self) -> ModelSlimLinearMethod: + return ModelSlimLinearMethod(self) + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.int8, torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 0 + + @classmethod + def get_name(self) -> str: + return "modelslim" + + @classmethod + def get_config_filenames(cls) -> List[str]: + filenames = ["quant_model_description.json"] + return filenames + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> ModelSlimConfig: + return cls(config) + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str, + ) -> Optional[QuantizeMethodBase]: + from sglang.srt.layers.linear import LinearBase + from sglang.srt.layers.moe.fused_moe_triton import FusedMoE + + if isinstance(layer, LinearBase): + if should_ignore_layer( + prefix, + ignore=self.ignore, + fused_mapping=self.packed_modules_mapping, + ): + return UnquantizedLinearMethod() + key = "model" + if "vision_model" in prefix: + key = "vision_model" + elif "visual" in prefix: + key = "visual" + packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {}) + prefix_in_quant_config = prefix + proj_name = prefix.split(".")[-1] + if proj_name in packed_modules_mapping_subset: + prefix_in_quant_config = prefix.replace( + proj_name, packed_modules_mapping_subset[proj_name][0] + ) + # self.is_dynamic = ( + # self.quant_description[prefix_in_quant_config + ".weight"] + # == "W8A8_DYNAMIC" + # ) + + if self.is_layer_skipped(prefix, packed_modules_mapping_subset): + return UnquantizedLinearMethod() + scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config) + if scheme is None: + return UnquantizedLinearMethod() + layer.scheme = scheme + return ( + ModelSlimLinearMethod(self) + ) + elif isinstance(layer, FusedMoE): + return ModelSlimMoeMethod.get_moe_method(self, layer, prefix) + return None + + def _get_scheme_from_parts( + self, weight_quant: BaseModel, input_quant: BaseModel + ) -> ModelSlimScheme: + + # Detect If Mixed Precision + # if self._is_wNa16_group_channel(weight_quant, input_quant): + # if ( + # self.quant_format == CompressionFormat.pack_quantized.value + # and weight_quant.num_bits in WNA16_SUPPORTED_BITS + # ): + # return CompressedTensorsWNA16( + # num_bits=weight_quant.num_bits, + # strategy=weight_quant.strategy, + # group_size=weight_quant.group_size, + # actorder=weight_quant.actorder, + # ) + # else: + # raise ImportError( + # "Other method (CompressedTensorsW4A16Sparse24) is not supported now" + # ) + + if is_activation_quantization_format(self.quant_format): + # if self._is_fp8_w8a8(weight_quant, input_quant): + # is_fp8_w8a8_supported = self._check_scheme_supported( + # CompressedTensorsW8A8Fp8.get_min_capability(), error=False + # ) + # if is_fp8_w8a8_supported: + # return CompressedTensorsW8A8Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=( + # input_quant and not input_quant.dynamic + # ), + # ) + # else: + # # note: input_quant will be present for converted models; + # # will be ignored during inference post loading + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=not input_quant.dynamic, + # ) + + # # note: input_quant can be None + # if self._is_fp8_w8a16(weight_quant, input_quant): + # is_static_input_scheme = input_quant and not input_quant.dynamic + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=is_static_input_scheme, + # ) + + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return ModelSlimW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=True, + input_symmetric=input_quant.symmetric, + ) + + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return ModelSlimW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=False, + input_symmetric=input_quant.symmetric, + ) + + raise NotImplementedError("No msmodelslim compatible scheme was found.") + + def get_scheme( + self, layer: torch.nn.Module, layer_name: Optional[str] = None + ) -> Optional[ModelSlimScheme]: + """ + get_scheme method adjusted for modelslim, taken from + python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py + """ + if self.target_scheme_map: + matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=self.target_scheme_map.keys(), + fused_mapping=self.packed_modules_mapping, + ) + + scheme_dict = self.target_scheme_map[matched_target] + weight_quant = scheme_dict.get("weights") + input_quant = scheme_dict.get("input_activations") + else: + # Find the quant_scheme + scheme = self._get_scheme_from_parts( # type: ignore + weight_quant=weight_quant, + input_quant=input_quant, + ) + + # Ascend doesn't support device capability + # self._check_scheme_supported(scheme.get_min_capability()) + logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) + return scheme + + def is_layer_skipped( + self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) + ): + # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped + proj_name = prefix.split(".")[-1] + if proj_name in fused_mapping: + shard_prefixes = [ + prefix.replace(proj_name, shard_proj_name) + for shard_proj_name in fused_mapping[proj_name] + ] + + is_skipped = None + for shard_prefix in shard_prefixes: + is_shard_skipped = ( + self.quant_description[shard_prefix + ".weight"] == "FLOAT" + ) + + if is_skipped is None: + is_skipped = is_shard_skipped + elif is_shard_skipped != is_skipped: + raise ValueError( + f"Detected some but not all shards of {prefix} " + "are quantized. All shards of fused layers " + "to have the same precision." + ) + else: + is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT" + + assert is_skipped is not None + return is_skipped + + def get_scaled_act_names(self) -> List[str]: + return [] + + def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool: + is_w4 = weight_quant.num_bits == 4 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + or weight_quant.strategy == QuantizationStrategy.GROUP.value + ) + if input_quant is not None: + is_token = ( + weight_strategy + and input_quant.strategy == QuantizationStrategy.TOKEN.value + ) + is_dynamic = not weight_quant.dynamic and input_quant.dynamic + else: + is_token = weight_strategy + is_dynamic = not weight_quant.dynamic + + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_w4 and weight_quant.symmetric and is_token and is_dynamic + + def _is_static_tensor_w8a8( + self, weight_quant: BaseModel, input_quant: BaseModel + ) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + ) + is_tensor = ( + weight_strategy + and input_quant.strategy == QuantizationStrategy.TENSOR.value + ) + is_static = not weight_quant.dynamic and not input_quant.dynamic + + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_8_bits and is_tensor and weight_quant.symmetric and is_static + + def _is_dynamic_token_w8a8( + self, weight_quant: BaseModel, input_quant: BaseModel + ) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + ) + is_token = ( + weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value + ) + is_dynamic = not weight_quant.dynamic and input_quant.dynamic + + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_8_bits and is_token and weight_quant.symmetric and is_dynamic + + +class ModelSlimLinearMethod(_NPULinearMethodBase): + + def __init__(self, quantization_config: ModelSlimConfig): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.scheme.process_weights_after_loading(layer) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + """ + Use the ModelSlimScheme associated with each layer to create + the necessary parameters for the layer. See LinearMethodBase for param + details + """ + weight_loader = extra_weight_attrs.get("weight_loader") + layer.scheme.create_weights( + layer=layer, + input_size=input_size, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + output_size=output_size, + params_dtype=params_dtype, + weight_loader=weight_loader, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ): + """ + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the + layer input. See LinearMethodBase for param details + + """ + + scheme = layer.scheme + if scheme is None: + raise ValueError("A scheme must be defined for each layer") + return scheme.apply_weights(layer, x, bias=bias) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py new file mode 100644 index 000000000000..bee981b3d3b1 --- /dev/null +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -0,0 +1,57 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/v0.8.2/vllm/model_executor/layers/quantization/compressed_tensors +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import enum +import logging +from enum import Enum +from typing import TYPE_CHECKING + +import torch +from compressed_tensors import CompressionFormat +from compressed_tensors.quantization import QuantizationStrategy + +from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig +from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo +from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase + +logger = logging.getLogger(__name__) + + +__all__ = [ + "ModelSlimMoEMethod", +] + + +class ModelSlimMoEMethod(FusedMoEMethodBase): + def __new__(cls, *args, **kwargs): + if cls is ModelSlimMoEMethod: + return super().__new__(cls) + return super().__new__(cls) + + @staticmethod + def get_moe_method( + quant_config: ModelSlimConfig, + layer: torch.nn.Module, + prefix: str, + ) -> "ModelSlimMoEMethod": + # TODO: @dsikka: refactor this to use schemes as other kernels + # are supported + check if the layer is being ignored. + + weight_quant = quant_config.target_scheme_map["Linear"].get("weights") + input_quant = quant_config.target_scheme_map["Linear"].get("input_activations") + is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant) + is_moe_input_quant = input_quant + + if ( + is_moe_w4_dynamic and is_moe_input_quant is not None + ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant): + return NPUW4A8Int4DynamicMoEMethod(quant_config) + elif is_moe_w4_dynamic and is_moe_input_quant is None: + return NPUW4A16Int4DynamicMoEMethod(quant_config) + else: + return NPUW8A8Int8DynamicMoEMethod(quant_config) + # else: + # raise RuntimeError( + # f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}" + # ) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py new file mode 100644 index 000000000000..e69de29bb2d1 From 66c7517f0288e10f6322566e3cb7d163e09d32d1 Mon Sep 17 00:00:00 2001 From: TamirBaydasov Date: Wed, 10 Dec 2025 15:57:26 +0300 Subject: [PATCH 013/175] Working msmodelslim structure, W8A8, W8A8 MoE, W4A4 --- python/sglang/srt/configs/model_config.py | 2 +- .../npu/quantization/fused_moe_method_npu.py | 113 +---------- .../npu/quantization/linear_method_npu.py | 159 +++------------ .../hardware_backend/npu/quantization/w4a4.py | 42 ---- .../srt/layers/quantization/__init__.py | 2 +- .../quantization/msmodelslim/msmodelslim.py | 182 +++++++++-------- .../msmodelslim/msmodelslim_moe.py | 191 ++++++++++++++++-- .../msmodelslim/schemes/__init__.py | 11 + .../msmodelslim/schemes/msmodelslim_scheme.py | 56 +++++ .../schemes/msmodelslim_w4a4_int4.py | 95 +++++++++ .../schemes/msmodelslim_w8a8_int8.py | 140 +++++++++++++ .../schemes/msmodelslim_w8a8_int8_moe.py | 0 12 files changed, 597 insertions(+), 396 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a4.py create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py delete mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index e4af64b1a116..100ebf48c7ca 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -26,7 +26,7 @@ from sglang.srt.environ import envs from sglang.srt.layers.quantization import QUANTIZATION_METHODS from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import is_hip, retry +from sglang.srt.utils import is_hip, retry, is_npu from sglang.srt.utils.hf_transformers_utils import ( get_config, get_context_length, diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 938314b0f425..41991a5e6a4a 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -141,115 +141,8 @@ def npu_fused_moe_without_routing_weights_bf16( class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase): - - def create_weights( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported - - self.num_experts = num_experts - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} - ) - - # weight - w13_weight = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size, - dtype=torch.int8, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition, - dtype=torch.int8, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - # scale - w13_weight_scale = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - w2_weight_scale = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), - requires_grad=False, - ) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) - # offset - w13_weight_offset = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_offset", w13_weight_offset) - set_weight_attrs(w13_weight_offset, extra_weight_attrs) - w2_weight_offset = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), - requires_grad=False, - ) - layer.register_parameter("w2_weight_offset", w2_weight_offset) - set_weight_attrs(w2_weight_offset, extra_weight_attrs) - - def release_weight_cache(self, weight: torch.Tensor): - # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) - origin_weight = weight.data.transpose(1, 2) - new_weight = origin_weight.contiguous() - origin_weight.untyped_storage().resize_(0) - return new_weight - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - weight_data = self.release_weight_cache(layer.w13_weight.data) - layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) - - weight_data = self.release_weight_cache(layer.w2_weight.data) - layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) - - layer.w13_weight_scale = torch.nn.Parameter( - layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), - requires_grad=False, - ) - layer.w2_weight_scale = torch.nn.Parameter( - layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False - ) - layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) - layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) - - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - - def create_moe_runner( - self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" - ): - self.moe_runner_config = moe_runner_config - + @staticmethod def apply( - self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": @@ -272,9 +165,9 @@ def apply( top_k=topk_ids.shape[1], ) return StandardCombineInput(hidden_states=output) - + + @staticmethod def apply_without_routing_weights( - self, layer, hidden_states, hidden_states_scale, diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 46db893b3495..7d61255e17e6 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -25,80 +25,9 @@ def __init__( class NPUW8A8Int8LinearMethod(_NPULinearMethodBase): - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - weight_loader = extra_weight_attrs.get("weight_loader") - output_size_per_partition = sum(output_partition_sizes) - - weight = ModelWeightParameter( - data=torch.empty( - (output_size_per_partition, input_size_per_partition), dtype=torch.int8 - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight", weight) - - weight_scale = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), dtype=params_dtype), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_scale", weight_scale) - - weight_offset = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), dtype=params_dtype), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_offset", weight_offset) - - input_scale = PerTensorScaleParameter( - data=torch.empty(1, dtype=params_dtype), - weight_loader=weight_loader, - ) - input_scale.ignore_warning = True - layer.register_parameter("input_scale", input_scale) - - input_offset = PerTensorScaleParameter( - data=torch.empty(1, dtype=params_dtype), - weight_loader=weight_loader, - ) - input_offset.ignore_warning = True - layer.register_parameter("input_offset", input_offset) - - quant_bias = ChannelQuantScaleParameter( - data=torch.empty(output_size_per_partition, dtype=torch.int32), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("quant_bias", quant_bias) - - if params_dtype == torch.bfloat16: - deq_scale_dtype = torch.float32 - elif params_dtype == torch.float16: - deq_scale_dtype = torch.int64 - else: - raise ValueError(f"Unsupported params_dtype: {params_dtype}") - deq_scale = ChannelQuantScaleParameter( - data=torch.empty(output_size_per_partition, dtype=deq_scale_dtype), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("deq_scale", deq_scale) - + + @staticmethod def apply( - self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None, @@ -129,75 +58,40 @@ def apply( output_dtype=original_dtype, ) - def process_weights_after_loading(self, layer: torch.nn.Module): - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight.data = npu_format_cast(layer.weight.data) - - layer.weight_scale.data = torch.flatten(layer.weight_scale.data) - layer.weight_offset.data = torch.flatten(layer.weight_offset.data) - - expanding_factor = layer.weight.data.shape[0] - layer.aclnn_input_scale = torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_offset = torch.nn.Parameter( - layer.input_offset.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase): - def create_weights( - self, + @staticmethod + def apply( layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - weight_loader = extra_weight_attrs.get("weight_loader") - output_size_per_partition = sum(output_partition_sizes) - - weight = ModelWeightParameter( - data=torch.empty( - (output_size_per_partition, input_size_per_partition), dtype=torch.int8 - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + original_dtype = x.dtype + quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x) + return torch.ops.npu.npu_quant_matmul( + quant_out, + layer.weight, + layer.weight_scale, + pertoken_scale=dynamic_scale, + bias=bias, + output_dtype=original_dtype, ) - layer.register_parameter("weight", weight) - weight_scale = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), dtype=params_dtype), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_scale", weight_scale) - weight_offset = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), dtype=params_dtype), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_offset", weight_offset) +class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase): + @staticmethod def apply( - self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None, + tp_rank: Optional[int] = 0, ) -> torch.Tensor: original_dtype = x.dtype - quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x) + quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant( + x, dst_type=torch.quint4x2 + ) return torch.ops.npu.npu_quant_matmul( quant_out, layer.weight, @@ -205,11 +99,4 @@ def apply( pertoken_scale=dynamic_scale, bias=bias, output_dtype=original_dtype, - ) - - def process_weights_after_loading(self, layer: torch.nn.Module): - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight.data = npu_format_cast(layer.weight.data) - - layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_offset.data = layer.weight_offset.data.flatten() + ) \ No newline at end of file diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py deleted file mode 100644 index 4676b4655872..000000000000 --- a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import TYPE_CHECKING, List, Optional - -import torch - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast -from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase - -class NPU_W4A4DynamicLinearMethodImpl: - """Linear method for NPU W4A4_DYNAMIC.""" - - def __init__(self): - self.transpose_weight = True - - @staticmethod - def apply( - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - tp_rank: Optional[int] = 0, - ) -> torch.Tensor: - original_dtype = x.dtype - quant_out, dynamic_scale = torch_npu.npu_dynamic_quant( - x, dst_type=torch.quint4x2 - ) - return torch_npu.npu_quant_matmul( - quant_out, - layer.weight, - layer.weight_scale, - pertoken_scale=dynamic_scale, - bias=bias, - output_dtype=original_dtype, - ) - - def process_weights_after_loading(self, layer): - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) - layer.weight_offset.data = layer.weight_offset.data.flatten() - layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( - layer.weight.data.to(torch.int32) - ) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index f4ec7d8c46a4..4aa1843a4d85 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -79,7 +79,7 @@ def override_quantization_method(self, *args, **kwargs): ) if is_npu(): - from sglang.srt.hardware_backend.npu.quantization.modelslim import ModelSlimConfig + from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig BASE_QUANTIZATION_METHODS.update( { diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 0f302d3565ae..b28893f0f42e 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -25,6 +25,11 @@ from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import ( ModelSlimMoEMethod, ) +from sglang.srt.layers.quantization.msmodelslim.schemes import ( + ModelSlimScheme, + ModelSlimW8A8Int8, + ModelSlimW4A4Int4, +) from sglang.srt.layers.quantization.compressed_tensors.utils import ( find_matched_target, is_activation_quantization_format, @@ -130,7 +135,7 @@ def get_min_capability(cls) -> int: return 0 @classmethod - def get_name(self) -> str: + def get_name(cls) -> str: return "modelslim" @classmethod @@ -188,9 +193,21 @@ def get_quant_method( return None def _get_scheme_from_parts( - self, weight_quant: BaseModel, input_quant: BaseModel + self, layer_name: str, ) -> ModelSlimScheme: + quant_type = self.quant_description[layer_name + '.weight'] + if quant_type == "W8A8_DYNAMIC": + return ModelSlimW8A8Int8( + quant_config=self.quant_description, + prefix=layer_name + ) + elif quant_type == "W4A4_DYNAMIC": + return ModelSlimW4A4Int4( + quant_config=self.quant_description, + prefix=layer_name + ) + # Detect If Mixed Precision # if self._is_wNa16_group_channel(weight_quant, input_quant): # if ( @@ -208,7 +225,7 @@ def _get_scheme_from_parts( # "Other method (CompressedTensorsW4A16Sparse24) is not supported now" # ) - if is_activation_quantization_format(self.quant_format): + #if is_activation_quantization_format(self.quant_format): # if self._is_fp8_w8a8(weight_quant, input_quant): # is_fp8_w8a8_supported = self._check_scheme_supported( # CompressedTensorsW8A8Fp8.get_min_capability(), error=False @@ -236,21 +253,7 @@ def _get_scheme_from_parts( # is_static_input_scheme=is_static_input_scheme, # ) - if self._is_static_tensor_w8a8(weight_quant, input_quant): - return ModelSlimW8A8Int8( - strategy=weight_quant.strategy, - is_static_input_scheme=True, - input_symmetric=input_quant.symmetric, - ) - - if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return ModelSlimW8A8Int8( - strategy=weight_quant.strategy, - is_static_input_scheme=False, - input_symmetric=input_quant.symmetric, - ) - - raise NotImplementedError("No msmodelslim compatible scheme was found.") + #raise NotImplementedError("No msmodelslim compatible scheme was found.") def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None @@ -259,23 +262,24 @@ def get_scheme( get_scheme method adjusted for modelslim, taken from python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py """ - if self.target_scheme_map: - matched_target = find_matched_target( - layer_name=layer_name, - module=layer, - targets=self.target_scheme_map.keys(), - fused_mapping=self.packed_modules_mapping, - ) - - scheme_dict = self.target_scheme_map[matched_target] - weight_quant = scheme_dict.get("weights") - input_quant = scheme_dict.get("input_activations") - else: + # if self.target_scheme_map: + # matched_target = find_matched_target( + # layer_name=layer_name, + # module=layer, + # targets=self.target_scheme_map.keys(), + # fused_mapping=self.packed_modules_mapping, + # ) + + # scheme_dict = self.target_scheme_map[matched_target] + # weight_quant = scheme_dict.get("weights") + # input_quant = scheme_dict.get("input_activations") + # else: # Find the quant_scheme - scheme = self._get_scheme_from_parts( # type: ignore - weight_quant=weight_quant, - input_quant=input_quant, - ) + scheme = self._get_scheme_from_parts( # type: ignore + # weight_quant=weight_quant, + # input_quant=input_quant, + layer_name=layer_name, + ) # Ascend doesn't support device capability # self._check_scheme_supported(scheme.get_min_capability()) @@ -316,61 +320,61 @@ def is_layer_skipped( def get_scaled_act_names(self) -> List[str]: return [] - def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool: - is_w4 = weight_quant.num_bits == 4 - weight_strategy = ( - weight_quant.strategy == QuantizationStrategy.TENSOR.value - or weight_quant.strategy == QuantizationStrategy.CHANNEL.value - or weight_quant.strategy == QuantizationStrategy.GROUP.value - ) - if input_quant is not None: - is_token = ( - weight_strategy - and input_quant.strategy == QuantizationStrategy.TOKEN.value - ) - is_dynamic = not weight_quant.dynamic and input_quant.dynamic - else: - is_token = weight_strategy - is_dynamic = not weight_quant.dynamic - - # Both symmetric and asymmetric input quantization supported. - # Only symmetric weight quantization supported. - return is_w4 and weight_quant.symmetric and is_token and is_dynamic - - def _is_static_tensor_w8a8( - self, weight_quant: BaseModel, input_quant: BaseModel - ) -> bool: - is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - weight_strategy = ( - weight_quant.strategy == QuantizationStrategy.TENSOR.value - or weight_quant.strategy == QuantizationStrategy.CHANNEL.value - ) - is_tensor = ( - weight_strategy - and input_quant.strategy == QuantizationStrategy.TENSOR.value - ) - is_static = not weight_quant.dynamic and not input_quant.dynamic - - # Both symmetric and asymmetric input quantization supported. - # Only symmetric weight quantization supported. - return is_8_bits and is_tensor and weight_quant.symmetric and is_static - - def _is_dynamic_token_w8a8( - self, weight_quant: BaseModel, input_quant: BaseModel - ) -> bool: - is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - weight_strategy = ( - weight_quant.strategy == QuantizationStrategy.TENSOR.value - or weight_quant.strategy == QuantizationStrategy.CHANNEL.value - ) - is_token = ( - weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value - ) - is_dynamic = not weight_quant.dynamic and input_quant.dynamic - - # Both symmetric and asymmetric input quantization supported. - # Only symmetric weight quantization supported. - return is_8_bits and is_token and weight_quant.symmetric and is_dynamic + # def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool: + # is_w4 = weight_quant.num_bits == 4 + # weight_strategy = ( + # weight_quant.strategy == QuantizationStrategy.TENSOR.value + # or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + # or weight_quant.strategy == QuantizationStrategy.GROUP.value + # ) + # if input_quant is not None: + # is_token = ( + # weight_strategy + # and input_quant.strategy == QuantizationStrategy.TOKEN.value + # ) + # is_dynamic = not weight_quant.dynamic and input_quant.dynamic + # else: + # is_token = weight_strategy + # is_dynamic = not weight_quant.dynamic + + # # Both symmetric and asymmetric input quantization supported. + # # Only symmetric weight quantization supported. + # return is_w4 and weight_quant.symmetric and is_token and is_dynamic + + # def _is_static_tensor_w8a8( + # self, weight_quant: BaseModel, input_quant: BaseModel + # ) -> bool: + # is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + # weight_strategy = ( + # weight_quant.strategy == QuantizationStrategy.TENSOR.value + # or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + # ) + # is_tensor = ( + # weight_strategy + # and input_quant.strategy == QuantizationStrategy.TENSOR.value + # ) + # is_static = not weight_quant.dynamic and not input_quant.dynamic + + # # Both symmetric and asymmetric input quantization supported. + # # Only symmetric weight quantization supported. + # return is_8_bits and is_tensor and weight_quant.symmetric and is_static + + # def _is_dynamic_token_w8a8( + # self, weight_quant: BaseModel, input_quant: BaseModel + # ) -> bool: + # is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + # weight_strategy = ( + # weight_quant.strategy == QuantizationStrategy.TENSOR.value + # or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + # ) + # is_token = ( + # weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value + # ) + # is_dynamic = not weight_quant.dynamic and input_quant.dynamic + + # # Both symmetric and asymmetric input quantization supported. + # # Only symmetric weight quantization supported. + # return is_8_bits and is_token and weight_quant.symmetric and is_dynamic class ModelSlimLinearMethod(_NPULinearMethodBase): diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index bee981b3d3b1..5dd239a6d1ab 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -5,21 +5,39 @@ import enum import logging from enum import Enum -from typing import TYPE_CHECKING +from typing import Callable, Optional, TYPE_CHECKING +from typing import Any, Dict, List import torch -from compressed_tensors import CompressionFormat -from compressed_tensors.quantization import QuantizationStrategy -from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase +from sglang.srt.layers.quantization.msmodelslim.schemes import ( + ModelSlimScheme, +) +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( + NPUW8A8Int8DynamicMoEMethod, +) + +from sglang.srt.utils import set_weight_attrs + +if TYPE_CHECKING: + from sglang.srt.layers.moe import MoeRunnerConfig + from sglang.srt.layers.moe.token_dispatcher import ( + CombineInput, + StandardDispatchOutput, + ) + from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ( + ModelSlimConfig, + ) logger = logging.getLogger(__name__) __all__ = [ "ModelSlimMoEMethod", + "ModelSlimW8A8Int8MoE", ] @@ -38,20 +56,159 @@ def get_moe_method( # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. - weight_quant = quant_config.target_scheme_map["Linear"].get("weights") - input_quant = quant_config.target_scheme_map["Linear"].get("input_activations") - is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant) - is_moe_input_quant = input_quant - - if ( - is_moe_w4_dynamic and is_moe_input_quant is not None - ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant): - return NPUW4A8Int4DynamicMoEMethod(quant_config) - elif is_moe_w4_dynamic and is_moe_input_quant is None: - return NPUW4A16Int4DynamicMoEMethod(quant_config) - else: - return NPUW8A8Int8DynamicMoEMethod(quant_config) + return ModelSlimW8A8Int8MoE(quant_config) + # weight_quant = quant_config.target_scheme_map["Linear"].get("weights") + # input_quant = quant_config.target_scheme_map["Linear"].get("input_activations") + # is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant) + # is_moe_input_quant = input_quant + + # if ( + # is_moe_w4_dynamic and is_moe_input_quant is not None + # ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant): + # return NPUW4A8Int4DynamicMoEMethod(quant_config) + # elif is_moe_w4_dynamic and is_moe_input_quant is None: + # return NPUW4A16Int4DynamicMoEMethod(quant_config) + # else: + # return NPUW8A8Int8DynamicMoEMethod(quant_config) # else: # raise RuntimeError( # f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}" # ) + + +class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod): + + def __init__( + self, quant_config: Dict[str, Any], prefix: str = None, + ): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported + + self.num_experts = num_experts + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + + # weight + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=torch.int8, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=torch.int8, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + # scale + w13_weight_scale = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + w2_weight_scale = torch.nn.Parameter( + torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + # offset + w13_weight_offset = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_offset", w13_weight_offset) + set_weight_attrs(w13_weight_offset, extra_weight_attrs) + w2_weight_offset = torch.nn.Parameter( + torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_offset", w2_weight_offset) + set_weight_attrs(w2_weight_offset, extra_weight_attrs) + + def release_weight_cache(self, weight: torch.Tensor): + # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) + origin_weight = weight.data.transpose(1, 2) + new_weight = origin_weight.contiguous() + origin_weight.untyped_storage().resize_(0) + return new_weight + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight_data = self.release_weight_cache(layer.w13_weight.data) + layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) + + weight_data = self.release_weight_cache(layer.w2_weight.data) + layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) + + layer.w13_weight_scale = torch.nn.Parameter( + layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), + requires_grad=False, + ) + layer.w2_weight_scale = torch.nn.Parameter( + layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False + ) + layer.w13_weight_offset = torch.nn.Parameter( + layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) + layer.w2_weight_offset = torch.nn.Parameter( + layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) + + layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) + layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" + ): + self.moe_runner_config = moe_runner_config + + def apply( + self, + layer, + dispatch_output: "StandardDispatchOutput", + ) -> "CombineInput": + return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + + + def apply_without_routing_weights( + self, + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ): + return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype,) \ No newline at end of file diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py index e69de29bb2d1..997892772977 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 + +from .msmodelslim_scheme import ModelSlimScheme +from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8 +from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4 + +__all__ = [ + "ModelSlimScheme", + "ModelSlimW8A8Int8", + "ModelSlimW4A4Int4", +] diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py index e69de29bb2d1..7e6669abe412 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py @@ -0,0 +1,56 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors +# SPDX-License-Identifier: Apache-2.0 + +from abc import ABC, abstractmethod +from typing import Optional + +import torch + +__all__ = ["ModelSlimScheme"] + + +class ModelSlimScheme(ABC): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by CompressedTensors. + """ + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def create_weights(self, *args, **kwargs): + """ + Weight creation for the particular scheme. Inputs to this function + + """ + raise NotImplementedError + + @abstractmethod + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] + ): + """ + Run the forward pass for the particular scheme. This is where + scheme-specific dequant/quant steps/kernels should be applied. + + :param layer: torch.nn.Module with the registered weights and + other parameters relevant to the particular scheme. + :param x: input to the layer + :param bias: bias parameter + + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py new file mode 100644 index 000000000000..87404a6269be --- /dev/null +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Callable, Optional + +import torch +from torch.nn import Parameter + +from typing import Any, Dict, List + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + NPU_W4A4DynamicLinearMethod, +) +from sglang.srt.layers.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) +from sglang.srt.layers.quantization.msmodelslim.schemes import ( + ModelSlimScheme, +) + +from sglang.srt.utils import set_weight_attrs + + +class ModelSlimW4A4Int4(ModelSlimScheme): + + def __init__( + self, quant_config: Dict[str, any], prefix: str, + ): + self.quant_config = quant_config + self.transpose_weight = True + self.is_dynamic = ( + self.quant_config[prefix + ".weight"] + == "W4A4_DYNAMIC" + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + weight_dict = self.quant_method.get_weight( + input_size_per_partition, output_size_per_partition, params_dtype + ) + for weight_name, weight_param in weight_dict.items(): + param = torch.nn.Parameter(weight_param, requires_grad=False) + set_weight_attrs(param, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter(weight_name, param) + set_weight_attrs(param, extra_weight_attrs) + + pertensor_dict = self.quant_method.get_pertensor_param(params_dtype) + for pertensor_name, pertensor_param in pertensor_dict.items(): + param = PerTensorScaleParameter( + data=pertensor_param, weight_loader=weight_loader + ) + # disable warning + param.ignore_warning = True + layer.register_parameter(pertensor_name, param) + + perchannel_dict = self.quant_method.get_perchannel_param( + output_size_per_partition, params_dtype + ) + for perchannel_name, perchannel_param in perchannel_dict.items(): + param = torch.nn.Parameter(perchannel_param, requires_grad=False) + set_weight_attrs(param, {"output_dim": 0}) + layer.register_parameter(perchannel_name, param) + set_weight_attrs(param, extra_weight_attrs) + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32) + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias) \ No newline at end of file diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index e69de29bb2d1..7963c87200c4 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -0,0 +1,140 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Callable, Optional + +import torch +from torch.nn import Parameter + +from typing import Any, Dict, List + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + NPUW8A8Int8DynamicLinearMethod, + NPUW8A8Int8LinearMethod +) +from sglang.srt.layers.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) +from sglang.srt.layers.quantization.msmodelslim.schemes import ( + ModelSlimScheme, +) + + +class ModelSlimW8A8Int8(ModelSlimScheme): + + def __init__( + self, quant_config: Dict[str, any], prefix: str, + ): + self.quant_config = quant_config + self.is_dynamic = ( + self.quant_config[prefix + ".weight"] + == "W8A8_DYNAMIC" + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + weight_loader = extra_weight_attrs.get("weight_loader") + output_size_per_partition = sum(output_partition_sizes) + + weight = ModelWeightParameter( + data=torch.empty( + (output_size_per_partition, input_size_per_partition), dtype=torch.int8 + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((output_size_per_partition, 1), dtype=params_dtype), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + weight_offset = ChannelQuantScaleParameter( + data=torch.empty((output_size_per_partition, 1), dtype=params_dtype), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_offset", weight_offset) + + if not self.is_dynamic: + input_scale = PerTensorScaleParameter( + data=torch.empty(1, dtype=params_dtype), + weight_loader=weight_loader, + ) + input_scale.ignore_warning = True + layer.register_parameter("input_scale", input_scale) + + input_offset = PerTensorScaleParameter( + data=torch.empty(1, dtype=params_dtype), + weight_loader=weight_loader, + ) + input_offset.ignore_warning = True + layer.register_parameter("input_offset", input_offset) + + quant_bias = ChannelQuantScaleParameter( + data=torch.empty(output_size_per_partition, dtype=torch.int32), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("quant_bias", quant_bias) + + if params_dtype == torch.bfloat16: + deq_scale_dtype = torch.float32 + elif params_dtype == torch.float16: + deq_scale_dtype = torch.int64 + else: + raise ValueError(f"Unsupported params_dtype: {params_dtype}") + deq_scale = ChannelQuantScaleParameter( + data=torch.empty(output_size_per_partition, dtype=deq_scale_dtype), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("deq_scale", deq_scale) + + def process_weights_after_loading(self, layer: torch.nn.Module): + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight.data = npu_format_cast(layer.weight.data) + + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_offset.data = layer.weight_offset.data.flatten() + + if not self.is_dynamic: + expanding_factor = layer.weight.data.shape[0] + layer.aclnn_input_scale = torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_offset = torch.nn.Parameter( + layer.input_offset.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if self.is_dynamic: + return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias) + else: + return NPUW8A8Int8LinearMethod.apply(layer, x, bias) \ No newline at end of file diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py deleted file mode 100644 index e69de29bb2d1..000000000000 From ccfe6f63ba3f74115f1239826d224df1c751d673 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:54:24 +0300 Subject: [PATCH 014/175] Delete w4a16_moe.py --- .../npu/quantization/w4a16_moe.py | 195 ------------------ 1 file changed, 195 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py deleted file mode 100644 index 2f3f2a4539f3..000000000000 --- a/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py +++ /dev/null @@ -1,195 +0,0 @@ -from typing import TYPE_CHECKING - -import numpy as np -import torch - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast -from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase - -class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase): - - def __init__(self) -> None: - self.group_size = 256 - self.tp_size = 1 - - def pack_to_int32(self, weight: torch.Tensor): - assert weight.dim() == 3 - if weight.dtype == torch.int32: - # pack 8 int4 to int32, we use a int32 to represent a int4 - assert ( - weight.shape[-1] % 8 == 0 - ), "the last dim of weight needs to be divided by 8" - new_weight = torch.ops.npu.npu_convert_weight_to_int4pack( - weight.flatten(0, 1) - ) - new_weight = new_weight.view(weight.shape[0], weight.shape[1], -1) - elif weight.dtype == torch.int8: - # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 - assert ( - weight.shape[-1] % 4 == 0 - ), "the last dim of weight needs to be divided by 4" - new_weight = weight.view(torch.int32).contiguous() - else: - raise ValueError(f"{weight.dtype=} is not supported !") - return new_weight - - def unpack_from_int32( - self, - value: torch.Tensor, - num_bits: int, - shape: torch.Size = None, - packed_dim=1, - ) -> torch.Tensor: - """ - Unpacks a tensor of packed int32 weights into individual int8s, maintaining the - original bit range. - - Return tensors in int8 - - :param value: tensor to unpack - :param num_bits: number of bits to unpack each data point into - :param shape: shape to unpack into, used to remove padding - :returns: unpacked int8 tensor - """ - if value.dtype is not torch.int32: - raise ValueError( - f"Expected {torch.int32} but got {value.dtype}, Aborting unpack." - ) - - if num_bits > 8: - raise ValueError("Unpacking is only supported for less than 8 bits") - - pack_factor = 32 // num_bits - - # unpack - mask = (1 << num_bits) - 1 - - if packed_dim == 1: - unpacked = torch.zeros( - (value.shape[0], value.shape[1] * pack_factor), - device=value.device, - dtype=torch.int32, - ) - for i in range(pack_factor): - unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask - - # remove padding - if shape is not None: - original_row_size = int(shape[1]) - unpacked = unpacked[:, :original_row_size] - else: - unpacked = torch.zeros( - (value.shape[0] * pack_factor, value.shape[1]), - device=value.device, - dtype=torch.int32, - ) - for i in range(pack_factor): - unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask - - # remove padding - original_row_size = int(shape[0]) - unpacked = unpacked[:original_row_size, :] - - # bits are packed in unsigned format, reformat to signed - # update the value range from unsigned to signed - offset = pow(2, num_bits) // 2 - unpacked = (unpacked - offset).to(torch.int8) - - return unpacked - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous() - w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous() - layer.w13_weight_scale = torch.nn.Parameter( - w13_weight_scale, requires_grad=False - ) - layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False) - - layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.transpose(-1, -2).contiguous(), - requires_grad=False, - ) - layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.transpose(-1, -2).contiguous(), - requires_grad=False, - ) - - # w = [n, k // 8] --> [k, n // 8] - # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous() - # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous() - unpacked_w13_weight = ( - self.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4) - .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1) - .transpose(1, 2) - .contiguous() - .int() - ) - unpacked_w2_weight = ( - self.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4) - .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1) - .transpose(1, 2) - .contiguous() - .int() - ) - - w13_weight = self.pack_to_int32(unpacked_w13_weight) - w2_weight = self.pack_to_int32(unpacked_w2_weight) - - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - - def create_moe_runner( - self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" - ): - self.moe_runner_config = moe_runner_config - - def apply( - self, - layer, - dispatch_output: "StandardDispatchOutput", - ) -> "CombineInput": - # FIXME W4A8 only support with deepep - raise NotImplementedError( - f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep" - ) - - def apply_without_routing_weights( - self, - layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype, - ): - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[self.w13_weight], - scale=[self.w13_weight_scale], - bias=[self.w13_scale_bias], - per_token_scale=[hidden_states_scale], - group_list=group_list, - split_item=2, - group_type=0, - group_list_type=group_list_type, - output_dtype=output_dtype, - )[0] - - # act_fn: swiglu - hidden_states = torch.ops.npu.npu_swiglu(hidden_states) - hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) - - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[self.w2_weight], - scale=[self.w2_weight_scale], - bias=[self.w2_scale_bias], - per_token_scale=[swiglu_out_scale], - group_list=group_list, - split_item=2, - group_type=0, - group_list_type=group_list_type, - output_dtype=output_dtype, - )[0] - - return hidden_states From 0a48b2bb1007687394cb5dc6c6728d2b0d37eb55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:54:38 +0300 Subject: [PATCH 015/175] Delete w4a8.py --- .../hardware_backend/npu/quantization/w4a8.py | 119 ------------------ 1 file changed, 119 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py deleted file mode 100644 index 7cd4dc81486a..000000000000 --- a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py +++ /dev/null @@ -1,119 +0,0 @@ -from __future__ import annotations - -import importlib -import sys -from types import MappingProxyType -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Mapping, - Optional, - Tuple, - Union, - cast, -) - -import torch -from torch.nn.parameter import Parameter - -from sglang.srt.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) -from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading -from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig -from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo -from sglang.srt.layers.parameter import ( - ChannelQuantScaleParameter, - ModelWeightParameter, - PerTensorScaleParameter, -) -from sglang.srt.layers.quantization.base_config import ( - FusedMoEMethodBase, - LinearMethodBase, - QuantizationConfig, - QuantizeMethodBase, -) -from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer -from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod -from sglang.srt.layers.quantization.w8a8_int8 import NPU_W8A8DynamicLinearMethod -from sglang.srt.utils import ( - apply_module_patch, - cpu_has_amx_support, - is_cpu, - is_cuda, - is_npu, - set_weight_attrs, - use_intel_amx_backend, -) - -if TYPE_CHECKING: - from sglang.srt.layers.moe.token_dispatcher import ( - CombineInput, - StandardDispatchOutput, - ) - -_is_cuda = is_cuda() -_is_cpu_amx_available = cpu_has_amx_support() -_is_cpu = is_cpu() -_is_npu = is_npu() - -if _is_npu: - import torch_npu - -class NPU_W4A8DynamicLinearMethod: - """Linear method for NPU W4A8_DYNAMIC.""" - - def __init__(self): - self.transpose_weight = True - try: - self.group_size = self.quantization_config.get("group_size", 256) - except AttributeError: - self.group_size = 256 - - @staticmethod - def process_scale_second(weight: torch.Tensor, scale: torch.Tensor, - per_group_scale: torch.Tensor): - k, n = weight.shape - group_num, n = per_group_scale.shape - weight_high = weight.to(torch.float32).reshape( - group_num, -1, n) * per_group_scale.reshape(group_num, 1, n) - weight_high = weight_high.reshape(k, n) - bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0) - antiquant_scale = (scale * per_group_scale).reshape(group_num, n) - return antiquant_scale.npu(), bias - - @staticmethod - def apply( - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - tp_rank: Optional[int] = 0, - ) -> torch.Tensor: - group_size = 256 - return torch_npu.npu_weight_quant_batchmatmul( - x, - layer.weight, - antiquant_scale=layer.weight_scale_second.to(x.dtype), - antiquant_group_size=group_size, - ) - - def process_weights_after_loading(self, layer): - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - - layer.weight_scale.data = layer.weight_scale.data.flatten().to( - torch.float32) - layer.weight_offset.data = layer.weight_offset.data.flatten() - layer.weight_scale_second.data, scale_bias = self.process_scale_second( - layer.weight.data, - layer.weight_scale.data, - layer.weight_scale_second.data.transpose(0, 1).contiguous(), - ) - param = torch.nn.Parameter(scale_bias, requires_grad=False) - layer.register_parameter("weight_scale_bias", param) - layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( - layer.weight.data.to(torch.int32)) From f4fdb0e3e5a24c4c76bdb60d6d6b1092dc136431 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:54:58 +0300 Subject: [PATCH 016/175] Delete w4a8_moe.py --- .../npu/quantization/w4a8_moe.py | 148 ------------------ 1 file changed, 148 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py deleted file mode 100644 index 3696c4d36380..000000000000 --- a/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import TYPE_CHECKING - -import numpy as np -import torch - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast -from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase - -class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase): - - def __init__(self) -> None: - self.group_size = 256 - self.tp_size = 1 - - def process_scale(self, weight: torch.Tensor, scale, per_group_scale): - scale = scale.transpose(1, 2).contiguous() - per_group_scale = per_group_scale.transpose(1, 2).contiguous() - group_num, k, n = weight.shape - # the weight of the new version is reduced by half by pack n, so it needs to be restored - n = n * 2 - per_group_scale = per_group_scale.reshape(group_num, -1, n) - group_num, quantgroup_num, n = per_group_scale.shape - bias = None - - scale_fp32 = (scale * per_group_scale).to(torch.float16).to(torch.float32) - scale_fp32_np = scale_fp32.cpu().numpy() - scale_fp32_np.dtype = np.uint32 - sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), dtype=np.uint32) - - sscale_uint64[..., ::2] = scale_fp32_np - - sscale_uint64_buffer = np.frombuffer( - sscale_uint64.tobytes(), dtype=np.int64 - ).copy() - sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape( - group_num, quantgroup_num, n - ) - sscale_uint64_tensor = sscale_uint64_tensor.npu() - return sscale_uint64_tensor, bias - - def update_bias(self, layer, w13_bias, w2_bias): - layer.w13_scale_bias.data = ( - layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) - ) - layer.w2_scale_bias.data = ( - layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) - ) - - def pack_to_int32(self, weight: torch.Tensor): - # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 - assert ( - weight.shape[-1] % 4 == 0 - ), "the last dim of weight needs to be divided by 4" - return weight.view(torch.int32).contiguous() - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.w13_weight = torch.nn.Parameter( - layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False - ) - layer.w2_weight = torch.nn.Parameter( - layer.w2_weight.data.transpose(1, 2).contiguous(), requires_grad=False - ) - - w13_weight_scale_second = ( - layer.w13_weight_scale_second.data - if hasattr(layer, "w13_weight_scale_second") - else None - ) - w2_weight_scale_second = ( - layer.w2_weight_scale_second.data - if hasattr(layer, "w2_weight_scale_second") - else None - ) - layer.w13_weight_scale.data, w13_bias = self.process_scale( - layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second - ) - layer.w2_weight_scale.data, w2_bias = self.process_scale( - layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second - ) - if hasattr(layer, "w13_weight_scale_second"): - # scale_second is no longer used, release this part of the memory - del layer.w13_weight_scale_second - del layer.w2_weight_scale_second - del layer.w13_weight_offset_second - del layer.w2_weight_offset_second - - self.update_bias(layer, w13_bias, w2_bias) - - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) - layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) - - def create_moe_runner( - self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" - ): - self.moe_runner_config = moe_runner_config - - def apply( - self, - layer, - dispatch_output: "StandardDispatchOutput", - ) -> "CombineInput": - # FIXME W4A8 only support with deepep - raise NotImplementedError( - f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep" - ) - - def apply_without_routing_weights( - self, - layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype, - ): - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[self.w13_weight], - scale=[self.w13_weight_scale], - bias=[self.w13_scale_bias], - per_token_scale=[hidden_states_scale], - group_list=group_list, - split_item=2, - group_type=0, - group_list_type=group_list_type, - output_dtype=output_dtype, - )[0] - - # act_fn: swiglu - hidden_states = torch.ops.npu.npu_swiglu(hidden_states) - hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) - - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[self.w2_weight], - scale=[self.w2_weight_scale], - bias=[self.w2_scale_bias], - per_token_scale=[swiglu_out_scale], - group_list=group_list, - split_item=2, - group_type=0, - group_list_type=group_list_type, - output_dtype=output_dtype, - )[0] - - return hidden_states From 1f4f87015537668eb98b83708e5805474b097b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:55:12 +0300 Subject: [PATCH 017/175] Delete w8a8.py --- .../hardware_backend/npu/quantization/w8a8.py | 100 ------------------ 1 file changed, 100 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py deleted file mode 100644 index f9ad7f4a16ac..000000000000 --- a/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py +++ /dev/null @@ -1,100 +0,0 @@ -from typing import TYPE_CHECKING, List, Optional - -import torch - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast -from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase - -class NPUW8A8Int8LinearMethod(_NPULinearMethodBase): - """Linear method for NPU W8A8.""" - - def __init__(self): - self.transpose_weight = True - - @staticmethod - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - from sglang.srt.layers.linear import RowParallelLinear - - original_dtype = x.dtype - if original_dtype != torch.int8: - x = torch.ops.npu.npu_quantize( - x, - layer.aclnn_input_scale_reciprocal, - layer.aclnn_input_offset, - torch.qint8, - -1, - False, - ) - # Only fuse bias add into GEMM for rank 0 (this ensures that - # bias will not get added more than once in Attention TP>1 case) - if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0: - quant_bias = None - else: - quant_bias = layer.quant_bias - return torch.ops.npu.npu_quant_matmul( - x, - layer.weight, - layer.deq_scale, - bias=quant_bias, - output_dtype=original_dtype, - ) - - def process_weights_after_loading(self, layer: torch.nn.Module): - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight.data = npu_format_cast(layer.weight.data) - - layer.weight_scale.data = torch.flatten(layer.weight_scale.data) - layer.weight_offset.data = torch.flatten(layer.weight_offset.data) - - expanding_factor = layer.weight.data.shape[0] - layer.aclnn_input_scale = torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_offset = torch.nn.Parameter( - layer.input_offset.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - - -class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase): - """Linear method for NPU W8A8_DYNAMIC.""" - - def __init__(self): - self.transpose_weight = True - - @staticmethod - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - original_dtype = x.dtype - quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x) - return torch.ops.npu.npu_quant_matmul( - quant_out, - layer.weight, - layer.weight_scale, - pertoken_scale=dynamic_scale, - bias=bias, - output_dtype=original_dtype, - ) - - def process_weights_after_loading(self, layer: torch.nn.Module): - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight.data = npu_format_cast(layer.weight.data) - - layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_offset.data = layer.weight_offset.data.flatten() From b5fcf782b0171e953ebbf344541538550716c2c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:55:29 +0300 Subject: [PATCH 018/175] Delete w8a8_moe.py --- .../npu/quantization/w8a8_moe.py | 215 ------------------ 1 file changed, 215 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py deleted file mode 100644 index 789e5b516ced..000000000000 --- a/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py +++ /dev/null @@ -1,215 +0,0 @@ -from typing import TYPE_CHECKING - -import numpy as np -import torch - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast -from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase - -def npu_fused_experts( - hidden_states: torch.Tensor, - w13: torch.Tensor, - w13_scale: torch.Tensor, - w2: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - **kwargs, -): - w13_offset = kwargs.get("w13_offset", None) - w2_offset = kwargs.get("w2_offset", None) - use_wna16 = kwargs.get("use_wna16", False) - - original_shape = hidden_states.shape - original_dtype = hidden_states.dtype - scale_dtype = original_dtype if original_dtype == torch.bfloat16 else torch.float32 - if len(original_shape) == 3: - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - num_tokens = hidden_states.shape[0] - num_experts = w13.shape[0] - row_idx_len = num_tokens * top_k - row_idx = ( - torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device) - .view(top_k, -1) - .permute(1, 0) - .contiguous() - ) - hidden_states, expanded_row_idx, expanded_expert_idx = ( - torch.ops.npu.npu_moe_init_routing( - hidden_states, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens - ) - ) - expert_tokens = torch.ops.npu.npu_moe_compute_expert_tokens( - expanded_expert_idx, num_experts - ) - expert_tokens = expert_tokens.to(torch.int64) - # gmm1: gate_up_proj - if not use_wna16: - hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) - scale_args13 = { - "scale": [w13_scale.to(scale_dtype)], - "per_token_scale": [pertoken_scale], - } - else: - scale_args13 = { - "antiquant_scale": [w13_scale], - "antiquant_offset": [w13_offset], - } - - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[w13], - **scale_args13, - split_item=2, - group_list_type=0, - group_type=0, - group_list=expert_tokens, - output_dtype=original_dtype, - )[0] - # act_fn: swiglu - hidden_states = torch.ops.npu.npu_swiglu(hidden_states) - if not use_wna16: - hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) - - scale_args2 = { - "scale": [w2_scale.to(scale_dtype)], - "per_token_scale": [pertoken_scale], - } - else: - scale_args2 = {"antiquant_scale": [w2_scale], "antiquant_offset": [w2_offset]} - # gmm2: down_proj - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[w2], - **scale_args2, - split_item=2, - group_list_type=0, - group_type=0, - group_list=expert_tokens, - output_dtype=original_dtype, - )[0] - - final_hidden_states = torch.ops.npu.npu_moe_finalize_routing( - hidden_states, - skip1=None, - skip2=None, - bias=None, - scales=topk_weights, - expanded_src_to_dst_row=expanded_row_idx, - export_for_source_row=topk_ids, - ) - if len(original_shape) == 3: - final_hidden_states = final_hidden_states.view(original_shape) - return final_hidden_states - -class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase): - - ### TODO remove this ### - def release_weight_cache(self, weight: torch.Tensor): - # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) - origin_weight = weight.data.transpose(1, 2) - new_weight = origin_weight.contiguous() - origin_weight.untyped_storage().resize_(0) - return new_weight - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - weight_data = self.release_weight_cache(layer.w13_weight.data) - layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) - - weight_data = self.release_weight_cache(layer.w2_weight.data) - layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) - - layer.w13_weight_scale = torch.nn.Parameter( - layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), - requires_grad=False, - ) - layer.w2_weight_scale = torch.nn.Parameter( - layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False - ) - layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) - layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) - - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - - def create_moe_runner( - self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" - ): - self.moe_runner_config = moe_runner_config - - def apply( - self, - layer, - dispatch_output: "StandardDispatchOutput", - ) -> "CombineInput": - from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput - - x = dispatch_output.hidden_states - topk_output = dispatch_output.topk_output - - topk_weights, topk_ids, _ = topk_output - topk_ids = topk_ids.to(torch.int32) - topk_weights = topk_weights.to(x.dtype) - output = npu_fused_experts( - hidden_states=x, - w13=layer.w13_weight, - w13_scale=layer.w13_weight_scale, - w2=layer.w2_weight, - w2_scale=layer.w2_weight_scale, - topk_weights=topk_weights, - topk_ids=topk_ids, - top_k=topk_ids.shape[1], - ) - return StandardCombineInput(hidden_states=output) - - def apply_without_routing_weights( - self, - layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype, - ): - # gmm1: gate_up_proj - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[layer.w13_weight], - split_item=2, - group_list_type=group_list_type, - group_type=0, - group_list=group_list, - output_dtype=torch.int32, - )[0] - - # act_fn: swiglu - hidden_states, swiglu_out_scale = torch.ops.npu.npu_dequant_swiglu_quant( - x=hidden_states, - weight_scale=layer.w13_weight_scale, - activation_scale=hidden_states_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=group_list, - activate_left=True, - quant_mode=1, - ) - - # gmm2: down_proj - hidden_states = torch.ops.npu.npu_grouped_matmul( - x=[hidden_states], - weight=[layer.w2_weight], - scale=[layer.w2_weight_scale.to(output_dtype)], - per_token_scale=[swiglu_out_scale], - split_item=2, - group_list_type=group_list_type, - group_type=0, - group_list=group_list, - output_dtype=output_dtype, - )[0] - return hidden_states From ba57bc71a47409e436772f2c435503a0cfcbbf30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:55:43 +0300 Subject: [PATCH 019/175] Delete utils.py --- .../hardware_backend/npu/quantization/utils.py | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/utils.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/utils.py b/python/sglang/srt/hardware_backend/npu/quantization/utils.py deleted file mode 100644 index 0350d85e6400..000000000000 --- a/python/sglang/srt/hardware_backend/npu/quantization/utils.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import TYPE_CHECKING, List, Optional - -from sglang.srt.layers.quantization.base_config import LinearMethodBase - -if TYPE_CHECKING: - from sglang.srt.layers.quantization.base_config import QuantizationConfig - -class _NPULinearMethodBase(LinearMethodBase): - - def __init__( - self, - quant_config: Optional["QuantizationConfig"] = None, - ): - super().__init__() - self.quant_config = quant_config From a5704f1655c51a3438a43171bb39c308e86478a2 Mon Sep 17 00:00:00 2001 From: TamirBaydasov Date: Thu, 11 Dec 2025 19:35:24 +0300 Subject: [PATCH 020/175] Move process_weights to kernel-side, add npu compressed-tensors w8a8int8 support --- .../npu/quantization/fused_moe_method_npu.py | 33 ++++ .../npu/quantization/linear_method_npu.py | 40 +++++ .../compressed_tensors/compressed_tensors.py | 41 +++-- .../compressed_tensors/schemes/__init__.py | 8 +- .../schemes/compressed_tensors_w8a8_int8.py | 155 +++++++++++------- .../quantization/msmodelslim/msmodelslim.py | 2 +- .../msmodelslim/msmodelslim_moe.py | 30 +--- .../schemes/msmodelslim_w4a4_int4.py | 10 +- .../schemes/msmodelslim_w8a8_int8.py | 24 +-- 9 files changed, 216 insertions(+), 127 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 877a0c406355..5b5098ed567c 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -141,6 +141,39 @@ def npu_fused_moe_without_routing_weights_bf16( class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase): + + def release_weight_cache(self, weight: torch.Tensor): + # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) + origin_weight = weight.data.transpose(1, 2) + new_weight = origin_weight.contiguous() + origin_weight.untyped_storage().resize_(0) + return new_weight + + @classmethod + def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: + weight_data = cls.release_weight_cache(layer.w13_weight.data) + layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) + + weight_data = cls.release_weight_cache(layer.w2_weight.data) + layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) + + layer.w13_weight_scale = torch.nn.Parameter( + layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), + requires_grad=False, + ) + layer.w2_weight_scale = torch.nn.Parameter( + layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False + ) + layer.w13_weight_offset = torch.nn.Parameter( + layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) + layer.w2_weight_offset = torch.nn.Parameter( + layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) + + layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) + layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) + @staticmethod def apply( layer, diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 7d61255e17e6..6481b4f79bf4 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -58,6 +58,28 @@ def apply( output_dtype=original_dtype, ) + @staticmethod + def process_weights_after_loading(layer: torch.nn.Module): + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight.data = npu_format_cast(layer.weight.data) + + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_offset.data = layer.weight_offset.data.flatten() + + expanding_factor = layer.weight.data.shape[0] + layer.aclnn_input_scale = torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_offset = torch.nn.Parameter( + layer.input_offset.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase): @@ -78,6 +100,14 @@ def apply( output_dtype=original_dtype, ) + @staticmethod + def process_weights_after_loading(layer: torch.nn.Module): + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight.data = npu_format_cast(layer.weight.data) + + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_offset.data = layer.weight_offset.data.flatten() + class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase): @@ -99,4 +129,14 @@ def apply( pertoken_scale=dynamic_scale, bias=bias, output_dtype=original_dtype, + ) + + @staticmethod + def process_weights_after_loading(layer): + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32) ) \ No newline at end of file diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 31f47b88bc2f..c8f8b6ee073d 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -31,7 +31,8 @@ WNA16_SUPPORTED_BITS, CompressedTensorsScheme, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, + GPUCompressedTensorsW8A8Int8, + NPUCompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsWNA16, ) @@ -42,6 +43,10 @@ ) from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod +from sglang.srt.utils import is_npu + +_is_npu = is_npu() + logger = logging.getLogger(__name__) @@ -439,18 +444,32 @@ def _get_scheme_from_parts( ) if self._is_static_tensor_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Int8( - strategy=weight_quant.strategy, - is_static_input_scheme=True, - input_symmetric=input_quant.symmetric, - ) + if _is_npu: + return NPUCompressedTensorsW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=True, + input_symmetric=input_quant.symmetric, + ) + else: + return GPUCompressedTensorsW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=True, + input_symmetric=input_quant.symmetric, + ) if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Int8( - strategy=weight_quant.strategy, - is_static_input_scheme=False, - input_symmetric=input_quant.symmetric, - ) + if _is_npu: + return NPUCompressedTensorsW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=False, + input_symmetric=input_quant.symmetric, + ) + else: + return GPUCompressedTensorsW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=False, + input_symmetric=input_quant.symmetric, + ) raise NotImplementedError("No compressed-tensors compatible scheme was found.") diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py index 6d9871917bbb..e424e5d7b448 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py @@ -2,7 +2,10 @@ from .compressed_tensors_scheme import CompressedTensorsScheme from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 -from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 +from .compressed_tensors_w8a8_int8 import ( + GPUCompressedTensorsW8A8Int8, + NPUCompressedTensorsW8A8Int8, +) from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16 @@ -10,7 +13,8 @@ "CompressedTensorsScheme", "CompressedTensorsW8A8Fp8", "CompressedTensorsW8A16Fp8", - "CompressedTensorsW8A8Int8", + "GPUCompressedTensorsW8A8Int8", + "NPUCompressedTensorsW8A8Int8", "CompressedTensorsWNA16", "WNA16_SUPPORTED_BITS", ] diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 9bca2834d646..278584198919 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -15,6 +15,10 @@ from sglang.srt.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, ) +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + NPUW8A8Int8DynamicLinearMethod, + NPUW8A8Int8LinearMethod +) from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 from sglang.srt.layers.quantization.utils import requantize_with_max_scale from sglang.srt.utils import is_cuda @@ -33,6 +37,73 @@ def __init__( self.is_static_input_scheme = is_static_input_scheme self.input_symmetric = input_symmetric + def create_weights( + self, + layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + + # WEIGHT + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, input_size_per_partition, dtype=torch.int8 + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + if self.strategy == QuantizationStrategy.CHANNEL: + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + else: + assert self.strategy == QuantizationStrategy.TENSOR + weight_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = PerTensorScaleParameter( + data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader + ) + layer.register_parameter("input_scale", input_scale) + + if not self.input_symmetric: + # Note: compressed-tensors stores the zp using the same dtype + # as the weights + # AZP loaded as int8 but used as int32 + input_zero_point = PerTensorScaleParameter( + data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader + ) + layer.register_parameter("input_zero_point", input_zero_point) + + +class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): + + def __init__( + self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool + ): + super.__init__( + strategy, + is_static_input_scheme, + input_symmetric + ) + @classmethod def get_min_capability(cls) -> int: # lovelace and up @@ -107,61 +178,6 @@ def process_weights_after_loading(self, layer) -> None: else: layer.azp_adj = None - def create_weights( - self, - layer: torch.nn.Module, - output_partition_sizes: list[int], - input_size_per_partition: int, - params_dtype: torch.dtype, - weight_loader: Callable, - **kwargs, - ): - output_size_per_partition = sum(output_partition_sizes) - layer.logical_widths = output_partition_sizes - - # WEIGHT - weight = ModelWeightParameter( - data=torch.empty( - output_size_per_partition, input_size_per_partition, dtype=torch.int8 - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - - layer.register_parameter("weight", weight) - - # WEIGHT SCALE - if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = ChannelQuantScaleParameter( - data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), - output_dim=0, - weight_loader=weight_loader, - ) - else: - assert self.strategy == QuantizationStrategy.TENSOR - weight_scale = PerTensorScaleParameter( - data=torch.empty(len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader, - ) - layer.register_parameter("weight_scale", weight_scale) - - # INPUT SCALE - if self.is_static_input_scheme: - input_scale = PerTensorScaleParameter( - data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader - ) - layer.register_parameter("input_scale", input_scale) - - if not self.input_symmetric: - # Note: compressed-tensors stores the zp using the same dtype - # as the weights - # AZP loaded as int8 but used as int32 - input_zero_point = PerTensorScaleParameter( - data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader - ) - layer.register_parameter("input_zero_point", input_zero_point) - def apply_weights( self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] ) -> torch.Tensor: @@ -171,3 +187,32 @@ def apply_weights( return int8_scaled_mm( x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias ) + + +class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): + + def __init__( + self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool + ): + super.__init__( + strategy, + is_static_input_scheme, + input_symmetric + ) + + @classmethod + def get_min_capability(cls) -> int: + return NotImplementedError + + def process_weights_after_loading(self, layer): + if self.is_static_input_scheme: + return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) + else: + return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) + + def apply_weights(self, layer, x, bias): + if self.is_static_input_scheme: + return NPUW8A8Int8LinearMethod.apply(layer) + else: + return NPUW8A8Int8DynamicLinearMethod.apply(layer) + diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index b28893f0f42e..0e250ea8c573 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -189,7 +189,7 @@ def get_quant_method( ModelSlimLinearMethod(self) ) elif isinstance(layer, FusedMoE): - return ModelSlimMoeMethod.get_moe_method(self, layer, prefix) + return ModelSlimMoEMethod.get_moe_method(self, layer, prefix) return None def _get_scheme_from_parts( diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 5dd239a6d1ab..c12dcf39fd47 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -153,36 +153,8 @@ def create_weights( layer.register_parameter("w2_weight_offset", w2_weight_offset) set_weight_attrs(w2_weight_offset, extra_weight_attrs) - def release_weight_cache(self, weight: torch.Tensor): - # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) - origin_weight = weight.data.transpose(1, 2) - new_weight = origin_weight.contiguous() - origin_weight.untyped_storage().resize_(0) - return new_weight - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - weight_data = self.release_weight_cache(layer.w13_weight.data) - layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) - - weight_data = self.release_weight_cache(layer.w2_weight.data) - layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) - - layer.w13_weight_scale = torch.nn.Parameter( - layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), - requires_grad=False, - ) - layer.w2_weight_scale = torch.nn.Parameter( - layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False - ) - layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) - layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) - - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) + NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py index 87404a6269be..1d633fcbb06a 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py @@ -30,7 +30,6 @@ def __init__( self, quant_config: Dict[str, any], prefix: str, ): self.quant_config = quant_config - self.transpose_weight = True self.is_dynamic = ( self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC" @@ -77,14 +76,7 @@ def create_weights( set_weight_attrs(param, extra_weight_attrs) def process_weights_after_loading(self, layer): - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) - layer.weight_offset.data = layer.weight_offset.data.flatten() - layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( - layer.weight.data.to(torch.int32) - ) + NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer) def apply_weights( self, diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index 7963c87200c4..b33764b858a9 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -107,26 +107,10 @@ def create_weights( layer.register_parameter("deq_scale", deq_scale) def process_weights_after_loading(self, layer: torch.nn.Module): - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight.data = npu_format_cast(layer.weight.data) - - layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_offset.data = layer.weight_offset.data.flatten() - - if not self.is_dynamic: - expanding_factor = layer.weight.data.shape[0] - layer.aclnn_input_scale = torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_offset = torch.nn.Parameter( - layer.input_offset.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) + if self.is_dynamic: + NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) + else: + NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) def apply_weights( self, From c42c8f1be2abc9c32c478a36467aab1518674bf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:38:38 +0300 Subject: [PATCH 021/175] Added check for empty scheme --- .../sglang/srt/layers/quantization/msmodelslim/msmodelslim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 0e250ea8c573..383ca74c0f02 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -183,7 +183,7 @@ def get_quant_method( return UnquantizedLinearMethod() scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config) if scheme is None: - return UnquantizedLinearMethod() + raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.") layer.scheme = scheme return ( ModelSlimLinearMethod(self) From 25d0d09c1fba729f2c06080f07626b5269e422b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:50:32 +0300 Subject: [PATCH 022/175] Remove unnecessary method --- .../msmodelslim/schemes/msmodelslim_scheme.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py index 7e6669abe412..1d09c384ca9e 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py @@ -15,14 +15,6 @@ class ModelSlimScheme(ABC): of different quantization schemes supported by CompressedTensors. """ - @classmethod - @abstractmethod - def get_min_capability(cls) -> int: - """ - Get minimum device capability. - """ - raise NotImplementedError - @abstractmethod def create_weights(self, *args, **kwargs): """ From ca4895ed635eaff99e23ceaf37274e564908d561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 12 Dec 2025 17:47:24 +0300 Subject: [PATCH 023/175] Add w4a8 support --- .../msmodelslim/msmodelslim_moe.py | 212 ++++++++++++++++-- 1 file changed, 194 insertions(+), 18 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index c12dcf39fd47..3b7d5172541e 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -17,6 +17,7 @@ ) from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( + NPUW4A8Int8DynamicMoEMethod, NPUW8A8Int8DynamicMoEMethod, ) @@ -37,6 +38,7 @@ __all__ = [ "ModelSlimMoEMethod", + "ModelSlimW4A8Int8MoE", "ModelSlimW8A8Int8MoE", ] @@ -56,24 +58,198 @@ def get_moe_method( # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. + prefix_in_quant_config = prefix + ".0.down_proj.weight" + is_moe_w4a8_dynamic = ( + quant_config.quant_description.get(prefix_in_quant_config, "STATIC") + == "W4A8_DYNAMIC" + ) + + if is_moe_w4a8_dynamic: + return ModelSlimW4A8Int8MoE(quant_config) + return ModelSlimW8A8Int8MoE(quant_config) - # weight_quant = quant_config.target_scheme_map["Linear"].get("weights") - # input_quant = quant_config.target_scheme_map["Linear"].get("input_activations") - # is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant) - # is_moe_input_quant = input_quant - - # if ( - # is_moe_w4_dynamic and is_moe_input_quant is not None - # ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant): - # return NPUW4A8Int4DynamicMoEMethod(quant_config) - # elif is_moe_w4_dynamic and is_moe_input_quant is None: - # return NPUW4A16Int4DynamicMoEMethod(quant_config) - # else: - # return NPUW8A8Int8DynamicMoEMethod(quant_config) - # else: - # raise RuntimeError( - # f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}" - # ) + + +class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod): + + def __init__( + self, quant_config: Dict[str, Any], prefix: str = None, + ): + self.quant_config = quant_config + self.group_size = 0 + self.tp_size = 1 + self.is_per_channel_weight = self.group_size == 0 + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported + + self.num_experts = num_experts + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + + # >> weight + w13_output_size = intermediate_size_per_partition + w2_output_size = hidden_size // 2 + w13_weight = torch.nn.Parameter( + torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + w2_output_size, + intermediate_size_per_partition, + dtype=torch.int8, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # >> scale + w13_weight_scale = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + + w2_weight_scale = torch.nn.Parameter( + torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # >> offset + w13_weight_offset = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_offset", w13_weight_offset) + set_weight_attrs(w13_weight_offset, extra_weight_attrs) + + w2_weight_offset = torch.nn.Parameter( + torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_offset", w2_weight_offset) + set_weight_attrs(w2_weight_offset, extra_weight_attrs) + + if not self.is_per_channel_weight: + # >>> special param for w4a8 + w13_weight_scale_second = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second) + set_weight_attrs(w13_weight_scale_second, extra_weight_attrs) + w13_weight_offset_second = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second) + set_weight_attrs(w13_weight_offset_second, extra_weight_attrs) + + w2_weight_scale_second = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second) + set_weight_attrs(w2_weight_scale_second, extra_weight_attrs) + + w2_weight_offset_second = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second) + set_weight_attrs(w2_weight_offset_second, extra_weight_attrs) + + w13_scale_bias = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_scale_bias", w13_scale_bias) + set_weight_attrs(w13_scale_bias, extra_weight_attrs) + + w2_scale_bias = torch.nn.Parameter( + torch.empty( + num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w2_scale_bias", w2_scale_bias) + set_weight_attrs(w2_scale_bias, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + print(layer) + NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" + ): + self.moe_runner_config = moe_runner_config + + def apply( + self, + layer, + dispatch_output: "StandardDispatchOutput", + ) -> "CombineInput": + return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + + + def apply_without_routing_weights( + self, + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ): + return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype,) class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod): @@ -183,4 +359,4 @@ def apply_without_routing_weights( hidden_states_scale, group_list_type, group_list, - output_dtype,) \ No newline at end of file + output_dtype,) From 28ff8e09539e492ab1b8dfd6e0ca2fe435034534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 12 Dec 2025 17:48:48 +0300 Subject: [PATCH 024/175] Add w4a8 support (kernel) --- .../npu/quantization/fused_moe_method_npu.py | 254 ++++++++---------- 1 file changed, 106 insertions(+), 148 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 5b5098ed567c..ac32ad5035b0 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -247,151 +247,22 @@ def apply_without_routing_weights( return hidden_states -class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase): +class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): def __init__(self) -> None: - self.group_size = 256 + self.group_size = 0 ### TODO or 256 self.tp_size = 1 + self.is_per_channel_weight = self.group_size == 0 - def create_weights( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported - - self.num_experts = num_experts - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} - ) - - # >> weight - w13_output_size = intermediate_size_per_partition - w2_output_size = hidden_size // 2 - w13_weight = torch.nn.Parameter( - torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8), - requires_grad=False, - ) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - w2_output_size, - intermediate_size_per_partition, - dtype=torch.int8, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - - # >> scale - w13_weight_scale = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - - w2_weight_scale = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), - requires_grad=False, - ) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) - - # >> offset - w13_weight_offset = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_offset", w13_weight_offset) - set_weight_attrs(w13_weight_offset, extra_weight_attrs) - - w2_weight_offset = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), - requires_grad=False, - ) - layer.register_parameter("w2_weight_offset", w2_weight_offset) - set_weight_attrs(w2_weight_offset, extra_weight_attrs) - - # >>> special param for w4a8 - w13_weight_scale_second = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second) - set_weight_attrs(w13_weight_scale_second, extra_weight_attrs) - w13_weight_offset_second = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second) - set_weight_attrs(w13_weight_offset_second, extra_weight_attrs) - - w2_weight_scale_second = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second) - set_weight_attrs(w2_weight_scale_second, extra_weight_attrs) - - w2_weight_offset_second = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second) - set_weight_attrs(w2_weight_offset_second, extra_weight_attrs) - - w13_scale_bias = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w13_scale_bias", w13_scale_bias) - set_weight_attrs(w13_scale_bias, extra_weight_attrs) - - w2_scale_bias = torch.nn.Parameter( - torch.empty( - num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w2_scale_bias", w2_scale_bias) - set_weight_attrs(w2_scale_bias, extra_weight_attrs) - + @classmethod def process_scale(self, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() + if self.is_per_channel_weight: + scale_np = scale.cpu().numpy() + scale_np.dtype = np.uint32 + scale_uint64_tensor = torch.from_numpy(scale_np.astype( + np.int64)).npu() + return scale_uint64_tensor, None per_group_scale = per_group_scale.transpose(1, 2).contiguous() group_num, k, n = weight.shape # the weight of the new version is reduced by half by pack n, so it needs to be restored @@ -416,6 +287,7 @@ def process_scale(self, weight: torch.Tensor, scale, per_group_scale): sscale_uint64_tensor = sscale_uint64_tensor.npu() return sscale_uint64_tensor, bias + @classmethod def update_bias(self, layer, w13_bias, w2_bias): layer.w13_scale_bias.data = ( layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) @@ -424,6 +296,7 @@ def update_bias(self, layer, w13_bias, w2_bias): layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) ) + @classmethod def pack_to_int32(self, weight: torch.Tensor): # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 assert ( @@ -431,6 +304,7 @@ def pack_to_int32(self, weight: torch.Tensor): ), "the last dim of weight needs to be divided by 4" return weight.view(torch.int32).contiguous() + @classmethod def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight = torch.nn.Parameter( layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False @@ -469,21 +343,105 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) - def create_moe_runner( - self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" - ): - self.moe_runner_config = moe_runner_config - + @classmethod def apply( self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": - # FIXME W4A8 only support with deepep - raise NotImplementedError( - f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep" - ) + from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput + hidden_states = dispatch_output.hidden_states + topk_output = dispatch_output.topk_output + + topk_weights, topk_ids, _ = topk_output + top_k=topk_ids.shape[1] + group_list_type = 1 + + self.original_shape = hidden_states.shape + self.topk_weights = topk_weights + + num_tokens = hidden_states.shape[:-1].numel() + + first_expert_idx = 0 + last_expert_idx = 128 + global_num_experts = 128 + + sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = ( + torch.ops.npu.npu_moe_init_routing_v2( + hidden_states, + topk_ids, + active_num=num_tokens * top_k, + expert_num=global_num_experts, + expert_tokens_num_type=1, + expert_tokens_num_flag=True, + active_expert_range=[first_expert_idx, last_expert_idx], + quant_mode=1, + )) + + expert_tokens = expert_tokens.to(torch.int64) + + bias1 = [layer.w13_scale_bias] + bias2 = [layer.w2_scale_bias] + w1_scale = [layer.w13_weight_scale] + w2_scale = [layer.w2_weight_scale] + # TODO w4a8 scene: dynamic acquisition of dtype in the future + _output_dtype = torch.bfloat16 + + hidden_states = torch.ops.npu.npu_grouped_matmul( + x=[sorted_hidden_states], + weight=[layer.w13_weight], + scale=w1_scale, + bias=bias1, + per_token_scale=[pertoken_scale], + group_list=expert_tokens, + split_item=2, + group_type=0, + group_list_type=group_list_type, + output_dtype=_output_dtype, + )[0] + + # act_fn: swiglu + hidden_states = torch.ops.npu.npu_swiglu(hidden_states) + hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states) + + output = torch.ops.npu.npu_grouped_matmul( + x=[hidden_states], + weight=[layer.w2_weight], + scale=w2_scale, + bias=bias2, + per_token_scale=[swiglu_out_scale], + group_list=expert_tokens, + split_item=2, + group_type=0, + group_list_type=group_list_type, + output_dtype=_output_dtype, + )[0] + + final_hidden_states = self.token_combine(hidden_states=output) + + return StandardCombineInput(hidden_states=final_hidden_states) + + @classmethod + def token_combine(self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None): + assert self.original_shape is not None + final_hidden_states = torch.ops.npu.npu_moe_token_unpermute( + permuted_tokens=hidden_states, + sorted_indices=torch.abs(self.expanded_row_idx), + probs=self.topk_weights) + if len(self.original_shape) == 3: + final_hidden_states = final_hidden_states.view(self.original_shape) + + # these values are no longer used, so they need to be set to None for memory release. + self.expert_map = None + self.topk_weights = None + self.topk_ids = None + self.expanded_row_idx = None + return final_hidden_states + + @classmethod def apply_without_routing_weights( self, layer, From d9412d4fadde6f9fcf4e94fcd5f157a7f1d58536 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 12 Dec 2025 18:21:47 +0300 Subject: [PATCH 025/175] Update fused_moe_method_npu.py --- .../npu/quantization/fused_moe_method_npu.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index ac32ad5035b0..91a0c633e0db 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -254,7 +254,6 @@ def __init__(self) -> None: self.tp_size = 1 self.is_per_channel_weight = self.group_size == 0 - @classmethod def process_scale(self, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() if self.is_per_channel_weight: @@ -287,7 +286,6 @@ def process_scale(self, weight: torch.Tensor, scale, per_group_scale): sscale_uint64_tensor = sscale_uint64_tensor.npu() return sscale_uint64_tensor, bias - @classmethod def update_bias(self, layer, w13_bias, w2_bias): layer.w13_scale_bias.data = ( layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) @@ -296,7 +294,6 @@ def update_bias(self, layer, w13_bias, w2_bias): layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) ) - @classmethod def pack_to_int32(self, weight: torch.Tensor): # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 assert ( @@ -305,7 +302,7 @@ def pack_to_int32(self, weight: torch.Tensor): return weight.view(torch.int32).contiguous() @classmethod - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w13_weight = torch.nn.Parameter( layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False ) @@ -323,10 +320,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if hasattr(layer, "w2_weight_scale_second") else None ) - layer.w13_weight_scale.data, w13_bias = self.process_scale( + layer.w13_weight_scale.data, w13_bias = cls.process_scale( layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second ) - layer.w2_weight_scale.data, w2_bias = self.process_scale( + layer.w2_weight_scale.data, w2_bias = cls.process_scale( layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second ) if hasattr(layer, "w13_weight_scale_second"): @@ -336,7 +333,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: del layer.w13_weight_offset_second del layer.w2_weight_offset_second - self.update_bias(layer, w13_bias, w2_bias) + cls.update_bias(layer, w13_bias, w2_bias) layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) @@ -345,7 +342,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: @classmethod def apply( - self, + cls, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": @@ -358,8 +355,8 @@ def apply( top_k=topk_ids.shape[1] group_list_type = 1 - self.original_shape = hidden_states.shape - self.topk_weights = topk_weights + cls.original_shape = hidden_states.shape + cls.topk_weights = topk_weights num_tokens = hidden_states.shape[:-1].numel() @@ -367,7 +364,7 @@ def apply( last_expert_idx = 128 global_num_experts = 128 - sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = ( + sorted_hidden_states, cls.expanded_row_idx, expert_tokens, pertoken_scale = ( torch.ops.npu.npu_moe_init_routing_v2( hidden_states, topk_ids, @@ -418,11 +415,10 @@ def apply( output_dtype=_output_dtype, )[0] - final_hidden_states = self.token_combine(hidden_states=output) + final_hidden_states = cls.token_combine(hidden_states=output) return StandardCombineInput(hidden_states=final_hidden_states) - @classmethod def token_combine(self, hidden_states: torch.Tensor, bias: torch.Tensor = None): @@ -441,9 +437,8 @@ def token_combine(self, self.expanded_row_idx = None return final_hidden_states - @classmethod + @staticmethod def apply_without_routing_weights( - self, layer, hidden_states, hidden_states_scale, From 0f81db38feb3d0ef18d92f2676fb2ca3f2743ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:50:57 +0300 Subject: [PATCH 026/175] Fix w8a8_static bug --- .../srt/layers/quantization/msmodelslim/msmodelslim.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 383ca74c0f02..d9e9805a1c0b 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -202,6 +202,11 @@ def _get_scheme_from_parts( quant_config=self.quant_description, prefix=layer_name ) + elif quant_type == "W8A8": + return ModelSlimW8A8Int8( + quant_config=self.quant_description, + prefix=layer_name + ) elif quant_type == "W4A4_DYNAMIC": return ModelSlimW4A4Int4( quant_config=self.quant_description, From 3175d8b30103cf1ddb62dc8e4b10e943ace5ec36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:32:10 +0300 Subject: [PATCH 027/175] Improving the code structure --- .../srt/layers/quantization/msmodelslim/msmodelslim.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index d9e9805a1c0b..4de4d04ec6b1 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -197,12 +197,7 @@ def _get_scheme_from_parts( ) -> ModelSlimScheme: quant_type = self.quant_description[layer_name + '.weight'] - if quant_type == "W8A8_DYNAMIC": - return ModelSlimW8A8Int8( - quant_config=self.quant_description, - prefix=layer_name - ) - elif quant_type == "W8A8": + if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8": return ModelSlimW8A8Int8( quant_config=self.quant_description, prefix=layer_name From 23db53f937b1a2c1f14ca17d2eb8d1e12cb05880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 15 Dec 2025 13:51:08 +0300 Subject: [PATCH 028/175] Delete print() --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 3b7d5172541e..fe83dc771117 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -219,7 +219,6 @@ def create_weights( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - print(layer) NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer) def create_moe_runner( From 393f7d1d2167aee713c6852d2d9c6b106eac6379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 15 Dec 2025 13:52:23 +0300 Subject: [PATCH 029/175] Update w4a8 for MOE --- .../npu/quantization/fused_moe_method_npu.py | 61 ++++++++----------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 91a0c633e0db..54622a1e0873 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -142,7 +142,8 @@ def npu_fused_moe_without_routing_weights_bf16( class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase): - def release_weight_cache(self, weight: torch.Tensor): + @classmethod + def release_weight_cache(cls, weight: torch.Tensor): # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) origin_weight = weight.data.transpose(1, 2) new_weight = origin_weight.contiguous() @@ -249,14 +250,11 @@ def apply_without_routing_weights( class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): - def __init__(self) -> None: - self.group_size = 0 ### TODO or 256 - self.tp_size = 1 - self.is_per_channel_weight = self.group_size == 0 - - def process_scale(self, weight: torch.Tensor, scale, per_group_scale): + @classmethod + def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() - if self.is_per_channel_weight: + #if cls.is_per_channel_weight: + if True: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 scale_uint64_tensor = torch.from_numpy(scale_np.astype( @@ -286,7 +284,8 @@ def process_scale(self, weight: torch.Tensor, scale, per_group_scale): sscale_uint64_tensor = sscale_uint64_tensor.npu() return sscale_uint64_tensor, bias - def update_bias(self, layer, w13_bias, w2_bias): + @classmethod + def update_bias(cls, layer, w13_bias, w2_bias): layer.w13_scale_bias.data = ( layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) ) @@ -294,7 +293,8 @@ def update_bias(self, layer, w13_bias, w2_bias): layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) ) - def pack_to_int32(self, weight: torch.Tensor): + @classmethod + def pack_to_int32(cls, weight: torch.Tensor): # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 assert ( weight.shape[-1] % 4 == 0 @@ -337,12 +337,11 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) - layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) + layer.w13_weight.data = cls.pack_to_int32(layer.w13_weight.data) + layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data) - @classmethod + staticmethod def apply( - cls, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": @@ -354,9 +353,8 @@ def apply( topk_weights, topk_ids, _ = topk_output top_k=topk_ids.shape[1] group_list_type = 1 - - cls.original_shape = hidden_states.shape - cls.topk_weights = topk_weights + original_shape = hidden_states.shape + topk_weights = topk_weights num_tokens = hidden_states.shape[:-1].numel() @@ -364,7 +362,7 @@ def apply( last_expert_idx = 128 global_num_experts = 128 - sorted_hidden_states, cls.expanded_row_idx, expert_tokens, pertoken_scale = ( + sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = ( torch.ops.npu.npu_moe_init_routing_v2( hidden_states, topk_ids, @@ -415,30 +413,19 @@ def apply( output_dtype=_output_dtype, )[0] - final_hidden_states = cls.token_combine(hidden_states=output) + assert original_shape is not None + final_hidden_states = torch.ops.npu.npu_moe_token_unpermute( + permuted_tokens=output, + sorted_indices=torch.abs(expanded_row_idx), + probs=topk_weights) + if len(original_shape) == 3: + final_hidden_states = final_hidden_states.view(original_shape) return StandardCombineInput(hidden_states=final_hidden_states) - def token_combine(self, - hidden_states: torch.Tensor, - bias: torch.Tensor = None): - assert self.original_shape is not None - final_hidden_states = torch.ops.npu.npu_moe_token_unpermute( - permuted_tokens=hidden_states, - sorted_indices=torch.abs(self.expanded_row_idx), - probs=self.topk_weights) - if len(self.original_shape) == 3: - final_hidden_states = final_hidden_states.view(self.original_shape) - - # these values are no longer used, so they need to be set to None for memory release. - self.expert_map = None - self.topk_weights = None - self.topk_ids = None - self.expanded_row_idx = None - return final_hidden_states - @staticmethod def apply_without_routing_weights( + cls, layer, hidden_states, hidden_states_scale, From d4d53e084e3f13cbc6c9d16e8412fe121a828aea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:42:47 +0300 Subject: [PATCH 030/175] Fix w4a4 weights loading --- .../schemes/msmodelslim_w4a4_int4.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py index 1d633fcbb06a..3bbbf4af1f2d 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py @@ -34,6 +34,23 @@ def __init__( self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC" ) + + @staticmethod + def get_weight( + input_size: int, output_size: int, params_dtype: torch.dtype + ) -> Dict[str, Any]: + params_dict = {"weight": torch.empty(output_size, input_size, dtype=torch.int8)} + return params_dict + + @staticmethod + def get_perchannel_param( + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + params_dict = {} + params_dict["weight_scale"] = torch.empty(output_size, 1, dtype=params_dtype) + params_dict["weight_offset"] = torch.empty(output_size, 1, dtype=params_dtype) + return params_dict def create_weights( self, @@ -47,17 +64,15 @@ def create_weights( ) -> None: output_size_per_partition = sum(output_partition_sizes) weight_loader = extra_weight_attrs.get("weight_loader") - - weight_dict = self.quant_method.get_weight( - input_size_per_partition, output_size_per_partition, params_dtype - ) + + weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)} for weight_name, weight_param in weight_dict.items(): param = torch.nn.Parameter(weight_param, requires_grad=False) set_weight_attrs(param, {"input_dim": 1, "output_dim": 0}) layer.register_parameter(weight_name, param) set_weight_attrs(param, extra_weight_attrs) - pertensor_dict = self.quant_method.get_pertensor_param(params_dtype) + pertensor_dict = {} for pertensor_name, pertensor_param in pertensor_dict.items(): param = PerTensorScaleParameter( data=pertensor_param, weight_loader=weight_loader @@ -65,10 +80,10 @@ def create_weights( # disable warning param.ignore_warning = True layer.register_parameter(pertensor_name, param) - - perchannel_dict = self.quant_method.get_perchannel_param( - output_size_per_partition, params_dtype - ) + + perchannel_dict = {} + perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) + perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) for perchannel_name, perchannel_param in perchannel_dict.items(): param = torch.nn.Parameter(perchannel_param, requires_grad=False) set_weight_attrs(param, {"output_dim": 0}) @@ -84,4 +99,4 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias) \ No newline at end of file + return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias) From 2bb7acf021d69c3badb695dc18fb71f682d85d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:48:03 +0300 Subject: [PATCH 031/175] Update model_config.py --- python/sglang/srt/configs/model_config.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 1361d58abe41..991df9756b7f 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -589,12 +589,11 @@ def _parse_quant_hf_config(self): return quant_cfg def _find_quant_modelslim_config(self): - quant_config_file = Path(self.model_path, "quant_model_description.json") - if quant_config_file.is_file(): - with open(quant_config_file) as f: + quant_config_file = Path(self.model_path, "quant_model_description.json") + quant_cfg = None + if quant_config_file.is_file(): + with open(quant_config_file) as f: quant_cfg = json.load(f) - else: - quant_cfg = None return quant_cfg From 4a05e5d361f1ce179d3e12317bf1a451fc5602f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:52:50 +0300 Subject: [PATCH 032/175] Add w4a4 test --- .../ascend/test_ascend_w4a4_quantization.py | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 test/manual/ascend/test_ascend_w4a4_quantization.py diff --git a/test/manual/ascend/test_ascend_w4a4_quantization.py b/test/manual/ascend/test_ascend_w4a4_quantization.py new file mode 100644 index 000000000000..c2251ec94a9d --- /dev/null +++ b/test/manual/ascend/test_ascend_w4a4_quantization.py @@ -0,0 +1,108 @@ +""" +Usage: +python3 -m unittest test_ascend_w4a4_quantization.TestAscendW4A4.test_gsm8k +""" + +import os +import time +import unittest +from types import SimpleNamespace +from urllib.parse import urlparse + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_ci, + popen_launch_server, +) + +if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" +DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( + 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 +) +DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" + + +class TestAscendW4A4(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--device", + "npu", + "--attention-backend", + "ascend", + "--tp-size", + "2", + "--mem-fraction-static", + "0.8", + "--cuda-graph-bs", + "64", + "--disable-radix-cache", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + base_url = DEFAULT_URL_FOR_TEST + url = urlparse(base_url) + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=128, + max_new_tokens=512, + parallel=64, + host=f"http://{url.hostname}", + port=int(url.port), + ) + metrics = run_eval(args) + print(metrics) + + self.assertGreaterEqual(metrics["accuracy"], 0.75) + self.assertGreaterEqual(metrics["output_throughput"], 700) + + def run_decode(self, max_new_tokens): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": max_new_tokens, + }, + "ignore_eos": True, + }, + ) + return response.json() + + def test_throughput(self): + max_tokens = 256 + + tic = time.perf_counter() + res = self.run_decode(max_tokens) + tok = time.perf_counter() + print(res["text"]) + throughput = max_tokens / (tok - tic) + print(f"Throughput: {throughput} tokens/s") + + if is_in_ci(): + self.assertGreaterEqual(throughput, 25) + + +if __name__ == "__main__": + unittest.main() From d0a577fb4ff0cdd9856dfdca51f7a4c19f1d2d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:54:29 +0300 Subject: [PATCH 033/175] Add compressed-tensors unit-test --- .../ascend/test_ascend_w8a8_quantization.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py index bf139f46a872..5c1bc3b66562 100644 --- a/test/manual/ascend/test_ascend_w8a8_quantization.py +++ b/test/manual/ascend/test_ascend_w8a8_quantization.py @@ -98,6 +98,76 @@ def test_throughput(self): if is_in_ci(): self.assertGreaterEqual(throughput, 25) + + +class TestAscendW8A8CompressedTensors(CustomTestCase): + @classmethod + def setUpClass(cls): + #TODO: Move model to CI or Modelscope + cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--disable-cuda-graph", + "--device", + "npu", + "--attention-backend", + "ascend", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + base_url = DEFAULT_URL_FOR_TEST + url = urlparse(base_url) + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host=f"http://{url.hostname}", + port=int(url.port), + ) + metrics = run_eval(args) + print(metrics) + + self.assertGreaterEqual(metrics["accuracy"], 0.3) + self.assertGreaterEqual(metrics["output_throughput"], 1000) + + def run_decode(self, max_new_tokens): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": max_new_tokens, + }, + "ignore_eos": True, + }, + ) + return response.json() + + def test_throughput(self): + max_tokens = 256 + + tic = time.perf_counter() + res = self.run_decode(max_tokens) + tok = time.perf_counter() + print(res["text"]) + throughput = max_tokens / (tok - tic) + print(f"Throughput: {throughput} tokens/s") + + if is_in_ci(): + self.assertGreaterEqual(throughput, 25) if __name__ == "__main__": From 77a923e4ea63dc6b55c49a5fdbba9ac1fc777fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D0=B0=D0=B2=D0=BA=D0=B8=D0=BD=20=D0=90=D1=80=D1=82?= =?UTF-8?q?=D0=B5=D0=BC?= Date: Wed, 17 Dec 2025 13:25:44 +0300 Subject: [PATCH 034/175] Pre-commit fixes --- python/sglang/srt/configs/model_config.py | 17 +- .../npu/quantization/fused_moe_method_npu.py | 20 ++- .../npu/quantization/linear_method_npu.py | 11 +- .../compressed_tensors/compressed_tensors.py | 4 +- .../schemes/compressed_tensors_w8a8_int8.py | 27 +-- .../quantization/msmodelslim/msmodelslim.py | 158 +++++++++--------- .../msmodelslim/msmodelslim_moe.py | 63 ++++--- .../msmodelslim/schemes/__init__.py | 2 +- .../schemes/msmodelslim_w4a4_int4.py | 48 +++--- .../schemes/msmodelslim_w8a8_int8.py | 27 ++- .../ascend/test_ascend_w8a8_quantization.py | 4 +- 11 files changed, 175 insertions(+), 206 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index a4253b9f59ce..8e7e98a77d0b 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -16,8 +16,8 @@ import logging import math import os -from pathlib import Path from enum import Enum, IntEnum, auto +from pathlib import Path from typing import Any, List, Optional, Set, Union import torch @@ -26,7 +26,7 @@ from sglang.srt.environ import envs from sglang.srt.layers.quantization import QUANTIZATION_METHODS from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import is_hip, retry, is_npu +from sglang.srt.utils import is_hip, is_npu, retry from sglang.srt.utils.hf_transformers_utils import ( get_config, get_context_length, @@ -39,6 +39,7 @@ logger = logging.getLogger(__name__) _is_npu = is_npu() + class AttentionArch(IntEnum): MLA = auto() MHA = auto() @@ -596,12 +597,12 @@ def _parse_quant_hf_config(self): return quant_cfg def _find_quant_modelslim_config(self): - quant_config_file = Path(self.model_path, "quant_model_description.json") - quant_cfg = None - if quant_config_file.is_file(): - with open(quant_config_file) as f: + quant_config_file = Path(self.model_path, "quant_model_description.json") + quant_cfg = None + if quant_config_file.is_file(): + with open(quant_config_file) as f: quant_cfg = json.load(f) - + return quant_cfg def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]: @@ -724,7 +725,7 @@ def _verify_quantization(self) -> None: quant_cfg = self._parse_quant_hf_config() if _is_npu: quant_cfg = self._find_quant_modelslim_config() - self.quantization = 'modelslim' + self.quantization = "modelslim" if quant_cfg is not None: quant_method = quant_cfg.get( diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 54622a1e0873..dfbab790d1ed 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -199,7 +199,7 @@ def apply( top_k=topk_ids.shape[1], ) return StandardCombineInput(hidden_states=output) - + @staticmethod def apply_without_routing_weights( layer, @@ -253,12 +253,11 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): @classmethod def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() - #if cls.is_per_channel_weight: + # if cls.is_per_channel_weight: if True: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 - scale_uint64_tensor = torch.from_numpy(scale_np.astype( - np.int64)).npu() + scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu() return scale_uint64_tensor, None per_group_scale = per_group_scale.transpose(1, 2).contiguous() group_num, k, n = weight.shape @@ -341,6 +340,7 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data) staticmethod + def apply( layer, dispatch_output: "StandardDispatchOutput", @@ -351,11 +351,11 @@ def apply( topk_output = dispatch_output.topk_output topk_weights, topk_ids, _ = topk_output - top_k=topk_ids.shape[1] + top_k = topk_ids.shape[1] group_list_type = 1 original_shape = hidden_states.shape topk_weights = topk_weights - + num_tokens = hidden_states.shape[:-1].numel() first_expert_idx = 0 @@ -372,7 +372,8 @@ def apply( expert_tokens_num_flag=True, active_expert_range=[first_expert_idx, last_expert_idx], quant_mode=1, - )) + ) + ) expert_tokens = expert_tokens.to(torch.int64) @@ -382,7 +383,7 @@ def apply( w2_scale = [layer.w2_weight_scale] # TODO w4a8 scene: dynamic acquisition of dtype in the future _output_dtype = torch.bfloat16 - + hidden_states = torch.ops.npu.npu_grouped_matmul( x=[sorted_hidden_states], weight=[layer.w13_weight], @@ -417,7 +418,8 @@ def apply( final_hidden_states = torch.ops.npu.npu_moe_token_unpermute( permuted_tokens=output, sorted_indices=torch.abs(expanded_row_idx), - probs=topk_weights) + probs=topk_weights, + ) if len(original_shape) == 3: final_hidden_states = final_hidden_states.view(original_shape) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 6481b4f79bf4..681d45d18f0b 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -1,13 +1,8 @@ -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Optional import torch from sglang.srt.hardware_backend.npu.utils import npu_format_cast -from sglang.srt.layers.parameter import ( - ChannelQuantScaleParameter, - ModelWeightParameter, - PerTensorScaleParameter, -) from sglang.srt.layers.quantization.base_config import LinearMethodBase if TYPE_CHECKING: @@ -25,7 +20,7 @@ def __init__( class NPUW8A8Int8LinearMethod(_NPULinearMethodBase): - + @staticmethod def apply( layer: torch.nn.Module, @@ -139,4 +134,4 @@ def process_weights_after_loading(layer): layer.weight_offset.data = layer.weight_offset.data.flatten() layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( layer.weight.data.to(torch.int32) - ) \ No newline at end of file + ) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 864bc91cc838..4e10c5d734eb 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -32,10 +32,10 @@ CompressedTensorsScheme, CompressedTensorsW4A4Fp4, CompressedTensorsW8A8Fp8, - GPUCompressedTensorsW8A8Int8, - NPUCompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsWNA16, + GPUCompressedTensorsW8A8Int8, + NPUCompressedTensorsW8A8Int8, ) from sglang.srt.layers.quantization.compressed_tensors.utils import ( find_matched_target, diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index fb12922df3be..64401aea6a71 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -7,6 +7,10 @@ from compressed_tensors.quantization import QuantizationStrategy from torch.nn import Parameter +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + NPUW8A8Int8DynamicLinearMethod, + NPUW8A8Int8LinearMethod, +) from sglang.srt.layers.parameter import ( ChannelQuantScaleParameter, ModelWeightParameter, @@ -15,10 +19,6 @@ from sglang.srt.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, ) -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( - NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod -) from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 from sglang.srt.layers.quantization.utils import requantize_with_max_scale from sglang.srt.utils import is_cuda @@ -94,15 +94,11 @@ def create_weights( class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): - + def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__( - strategy, - is_static_input_scheme, - input_symmetric - ) + super.__init__(strategy, is_static_input_scheme, input_symmetric) @classmethod def get_min_capability(cls) -> int: @@ -190,15 +186,11 @@ def apply_weights( class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): - + def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__( - strategy, - is_static_input_scheme, - input_symmetric - ) + super.__init__(strategy, is_static_input_scheme, input_symmetric) @classmethod def get_min_capability(cls) -> int: @@ -209,10 +201,9 @@ def process_weights_after_loading(self, layer): return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) else: return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) - + def apply_weights(self, layer, x, bias): if self.is_static_input_scheme: return NPUW8A8Int8LinearMethod.apply(layer) else: return NPUW8A8Int8DynamicLinearMethod.apply(layer) - diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 4de4d04ec6b1..7825b3fd2027 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -5,42 +5,36 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast import torch -from compressed_tensors.quantization import QuantizationStrategy -from pydantic import BaseModel # from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( # NPUW4A8Int4DynamicMoEMethod, # NPUW4A16Int4DynamicMoEMethod, # NPUW8A8Int8DynamicMoEMethod, # ) -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( - _NPULinearMethodBase - # NPUW8A8Int8DynamicLinearMethod, - # NPUW8A8Int8LinearMethod, +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod, + _NPULinearMethodBase, ) from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) +from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import ( ModelSlimMoEMethod, ) from sglang.srt.layers.quantization.msmodelslim.schemes import ( ModelSlimScheme, - ModelSlimW8A8Int8, ModelSlimW4A4Int4, + ModelSlimW8A8Int8, ) -from sglang.srt.layers.quantization.compressed_tensors.utils import ( - find_matched_target, - is_activation_quantization_format, - should_ignore_layer -) -#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer + +# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod from sglang.srt.utils import apply_module_patch logger = logging.getLogger(__name__) + # func refers to RMSNorm.__init__ def npu_wrapper_rmsnorm_init(func): def init(self, hidden_size: int, **extra_args) -> None: @@ -51,6 +45,7 @@ def init(self, hidden_size: int, **extra_args) -> None: return init + # func refers to RMSNorm.forward_oot def npu_wrapper_rmsnorm_forward(func): def _rmsnorm_forward_oot( @@ -122,7 +117,7 @@ def __init__(self, quant_config: Dict[str, Any] = {}): "forward_npu", [npu_wrapper_rmsnorm_forward], ) - + def get_linear_method(self) -> ModelSlimLinearMethod: return ModelSlimLinearMethod(self) @@ -183,29 +178,28 @@ def get_quant_method( return UnquantizedLinearMethod() scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config) if scheme is None: - raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.") + raise NotImplementedError( + "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes." + ) layer.scheme = scheme - return ( - ModelSlimLinearMethod(self) - ) + return ModelSlimLinearMethod(self) elif isinstance(layer, FusedMoE): return ModelSlimMoEMethod.get_moe_method(self, layer, prefix) return None def _get_scheme_from_parts( - self, layer_name: str, - ) -> ModelSlimScheme: + self, + layer_name: str, + ) -> ModelSlimScheme: - quant_type = self.quant_description[layer_name + '.weight'] + quant_type = self.quant_description[layer_name + ".weight"] if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8": return ModelSlimW8A8Int8( - quant_config=self.quant_description, - prefix=layer_name + quant_config=self.quant_description, prefix=layer_name ) elif quant_type == "W4A4_DYNAMIC": return ModelSlimW4A4Int4( - quant_config=self.quant_description, - prefix=layer_name + quant_config=self.quant_description, prefix=layer_name ) # Detect If Mixed Precision @@ -225,66 +219,66 @@ def _get_scheme_from_parts( # "Other method (CompressedTensorsW4A16Sparse24) is not supported now" # ) - #if is_activation_quantization_format(self.quant_format): - # if self._is_fp8_w8a8(weight_quant, input_quant): - # is_fp8_w8a8_supported = self._check_scheme_supported( - # CompressedTensorsW8A8Fp8.get_min_capability(), error=False - # ) - # if is_fp8_w8a8_supported: - # return CompressedTensorsW8A8Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=( - # input_quant and not input_quant.dynamic - # ), - # ) - # else: - # # note: input_quant will be present for converted models; - # # will be ignored during inference post loading - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=not input_quant.dynamic, - # ) - - # # note: input_quant can be None - # if self._is_fp8_w8a16(weight_quant, input_quant): - # is_static_input_scheme = input_quant and not input_quant.dynamic - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=is_static_input_scheme, - # ) - - #raise NotImplementedError("No msmodelslim compatible scheme was found.") - - def get_scheme( - self, layer: torch.nn.Module, layer_name: Optional[str] = None - ) -> Optional[ModelSlimScheme]: - """ - get_scheme method adjusted for modelslim, taken from - python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py - """ - # if self.target_scheme_map: - # matched_target = find_matched_target( - # layer_name=layer_name, - # module=layer, - # targets=self.target_scheme_map.keys(), - # fused_mapping=self.packed_modules_mapping, + # if is_activation_quantization_format(self.quant_format): + # if self._is_fp8_w8a8(weight_quant, input_quant): + # is_fp8_w8a8_supported = self._check_scheme_supported( + # CompressedTensorsW8A8Fp8.get_min_capability(), error=False # ) + # if is_fp8_w8a8_supported: + # return CompressedTensorsW8A8Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=( + # input_quant and not input_quant.dynamic + # ), + # ) + # else: + # # note: input_quant will be present for converted models; + # # will be ignored during inference post loading + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=not input_quant.dynamic, + # ) - # scheme_dict = self.target_scheme_map[matched_target] - # weight_quant = scheme_dict.get("weights") - # input_quant = scheme_dict.get("input_activations") - # else: - # Find the quant_scheme - scheme = self._get_scheme_from_parts( # type: ignore - # weight_quant=weight_quant, - # input_quant=input_quant, - layer_name=layer_name, - ) + # # note: input_quant can be None + # if self._is_fp8_w8a16(weight_quant, input_quant): + # is_static_input_scheme = input_quant and not input_quant.dynamic + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=is_static_input_scheme, + # ) + + # raise NotImplementedError("No msmodelslim compatible scheme was found.") + + def get_scheme( + self, layer: torch.nn.Module, layer_name: Optional[str] = None + ) -> Optional[ModelSlimScheme]: + """ + get_scheme method adjusted for modelslim, taken from + python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py + """ + # if self.target_scheme_map: + # matched_target = find_matched_target( + # layer_name=layer_name, + # module=layer, + # targets=self.target_scheme_map.keys(), + # fused_mapping=self.packed_modules_mapping, + # ) + + # scheme_dict = self.target_scheme_map[matched_target] + # weight_quant = scheme_dict.get("weights") + # input_quant = scheme_dict.get("input_activations") + # else: + # Find the quant_scheme + scheme = self._get_scheme_from_parts( # type: ignore + # weight_quant=weight_quant, + # input_quant=input_quant, + layer_name=layer_name, + ) - # Ascend doesn't support device capability - # self._check_scheme_supported(scheme.get_min_capability()) - logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) - return scheme + # Ascend doesn't support device capability + # self._check_scheme_supported(scheme.get_min_capability()) + logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) + return scheme def is_layer_skipped( self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index fe83dc771117..4b7b596c4f8d 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -2,25 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import enum import logging -from enum import Enum -from typing import Callable, Optional, TYPE_CHECKING -from typing import Any, Dict, List +from typing import TYPE_CHECKING, Any, Dict import torch -from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo -from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase -from sglang.srt.layers.quantization.msmodelslim.schemes import ( - ModelSlimScheme, -) -from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( NPUW4A8Int8DynamicMoEMethod, NPUW8A8Int8DynamicMoEMethod, ) - +from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase from sglang.srt.utils import set_weight_attrs if TYPE_CHECKING: @@ -29,9 +20,7 @@ CombineInput, StandardDispatchOutput, ) - from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ( - ModelSlimConfig, - ) + from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig logger = logging.getLogger(__name__) @@ -73,7 +62,9 @@ def get_moe_method( class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod): def __init__( - self, quant_config: Dict[str, Any], prefix: str = None, + self, + quant_config: Dict[str, Any], + prefix: str = None, ): self.quant_config = quant_config self.group_size = 0 @@ -173,7 +164,9 @@ def create_weights( ), requires_grad=False, ) - layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second) + layer.register_parameter( + "w13_weight_offset_second", w13_weight_offset_second + ) set_weight_attrs(w13_weight_offset_second, extra_weight_attrs) w2_weight_scale_second = torch.nn.Parameter( @@ -225,14 +218,13 @@ def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" ): self.moe_runner_config = moe_runner_config - + def apply( self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output) - def apply_without_routing_weights( self, @@ -243,18 +235,22 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype,) + return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights( + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ) class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod): def __init__( - self, quant_config: Dict[str, Any], prefix: str = None, + self, + quant_config: Dict[str, Any], + prefix: str = None, ): self.quant_config = quant_config @@ -335,14 +331,13 @@ def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" ): self.moe_runner_config = moe_runner_config - + def apply( self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output) - def apply_without_routing_weights( self, @@ -353,9 +348,11 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype,) + return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights( + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py index 997892772977..fba516eed7c0 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from .msmodelslim_scheme import ModelSlimScheme -from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8 from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4 +from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8 __all__ = [ "ModelSlimScheme", diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py index 3bbbf4af1f2d..1b578837c8d4 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py @@ -1,40 +1,28 @@ # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Any, Dict, List, Optional import torch -from torch.nn import Parameter -from typing import Any, Dict, List - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( NPU_W4A4DynamicLinearMethod, ) -from sglang.srt.layers.parameter import ( - ChannelQuantScaleParameter, - ModelWeightParameter, - PerTensorScaleParameter, -) -from sglang.srt.layers.quantization.msmodelslim.schemes import ( - ModelSlimScheme, -) - +from sglang.srt.layers.parameter import PerTensorScaleParameter +from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme from sglang.srt.utils import set_weight_attrs class ModelSlimW4A4Int4(ModelSlimScheme): def __init__( - self, quant_config: Dict[str, any], prefix: str, + self, + quant_config: Dict[str, any], + prefix: str, ): self.quant_config = quant_config - self.is_dynamic = ( - self.quant_config[prefix + ".weight"] - == "W4A4_DYNAMIC" - ) - + self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC" + @staticmethod def get_weight( input_size: int, output_size: int, params_dtype: torch.dtype @@ -64,8 +52,12 @@ def create_weights( ) -> None: output_size_per_partition = sum(output_partition_sizes) weight_loader = extra_weight_attrs.get("weight_loader") - - weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)} + + weight_dict = { + "weight": torch.empty( + output_size_per_partition, input_size_per_partition, dtype=torch.int8 + ) + } for weight_name, weight_param in weight_dict.items(): param = torch.nn.Parameter(weight_param, requires_grad=False) set_weight_attrs(param, {"input_dim": 1, "output_dim": 0}) @@ -80,10 +72,14 @@ def create_weights( # disable warning param.ignore_warning = True layer.register_parameter(pertensor_name, param) - + perchannel_dict = {} - perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) - perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) + perchannel_dict["weight_scale"] = torch.empty( + output_size_per_partition, 1, dtype=params_dtype + ) + perchannel_dict["weight_offset"] = torch.empty( + output_size_per_partition, 1, dtype=params_dtype + ) for perchannel_name, perchannel_param in perchannel_dict.items(): param = torch.nn.Parameter(perchannel_param, requires_grad=False) set_weight_attrs(param, {"output_dim": 0}) @@ -92,7 +88,7 @@ def create_weights( def process_weights_after_loading(self, layer): NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer) - + def apply_weights( self, layer: torch.nn.Module, diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index b33764b858a9..de99c9fed0b7 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -1,38 +1,31 @@ # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Dict, List, Optional import torch -from torch.nn import Parameter -from typing import Any, Dict, List - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod + NPUW8A8Int8LinearMethod, ) from sglang.srt.layers.parameter import ( ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter, ) -from sglang.srt.layers.quantization.msmodelslim.schemes import ( - ModelSlimScheme, -) +from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme class ModelSlimW8A8Int8(ModelSlimScheme): def __init__( - self, quant_config: Dict[str, any], prefix: str, + self, + quant_config: Dict[str, any], + prefix: str, ): self.quant_config = quant_config - self.is_dynamic = ( - self.quant_config[prefix + ".weight"] - == "W8A8_DYNAMIC" - ) + self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC" def create_weights( self, @@ -70,7 +63,7 @@ def create_weights( weight_loader=weight_loader, ) layer.register_parameter("weight_offset", weight_offset) - + if not self.is_dynamic: input_scale = PerTensorScaleParameter( data=torch.empty(1, dtype=params_dtype), @@ -111,7 +104,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module): NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) else: NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) - + def apply_weights( self, layer: torch.nn.Module, @@ -121,4 +114,4 @@ def apply_weights( if self.is_dynamic: return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias) else: - return NPUW8A8Int8LinearMethod.apply(layer, x, bias) \ No newline at end of file + return NPUW8A8Int8LinearMethod.apply(layer, x, bias) diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py index 5c1bc3b66562..959bf88a513f 100644 --- a/test/manual/ascend/test_ascend_w8a8_quantization.py +++ b/test/manual/ascend/test_ascend_w8a8_quantization.py @@ -98,12 +98,12 @@ def test_throughput(self): if is_in_ci(): self.assertGreaterEqual(throughput, 25) - + class TestAscendW8A8CompressedTensors(CustomTestCase): @classmethod def setUpClass(cls): - #TODO: Move model to CI or Modelscope + # TODO: Move model to CI or Modelscope cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( From 39179198f4be81f3ea19a808f8def4e7f01220d0 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Wed, 17 Dec 2025 13:29:34 +0300 Subject: [PATCH 035/175] Revert "Pre-commit fixes" This reverts commit 77a923e4ea63dc6b55c49a5fdbba9ac1fc777fe3. --- python/sglang/srt/configs/model_config.py | 17 +- .../npu/quantization/fused_moe_method_npu.py | 20 +-- .../npu/quantization/linear_method_npu.py | 11 +- .../compressed_tensors/compressed_tensors.py | 4 +- .../schemes/compressed_tensors_w8a8_int8.py | 27 ++- .../quantization/msmodelslim/msmodelslim.py | 158 +++++++++--------- .../msmodelslim/msmodelslim_moe.py | 63 +++---- .../msmodelslim/schemes/__init__.py | 2 +- .../schemes/msmodelslim_w4a4_int4.py | 48 +++--- .../schemes/msmodelslim_w8a8_int8.py | 27 +-- .../ascend/test_ascend_w8a8_quantization.py | 4 +- 11 files changed, 206 insertions(+), 175 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 8e7e98a77d0b..a4253b9f59ce 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -16,8 +16,8 @@ import logging import math import os -from enum import Enum, IntEnum, auto from pathlib import Path +from enum import Enum, IntEnum, auto from typing import Any, List, Optional, Set, Union import torch @@ -26,7 +26,7 @@ from sglang.srt.environ import envs from sglang.srt.layers.quantization import QUANTIZATION_METHODS from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import is_hip, is_npu, retry +from sglang.srt.utils import is_hip, retry, is_npu from sglang.srt.utils.hf_transformers_utils import ( get_config, get_context_length, @@ -39,7 +39,6 @@ logger = logging.getLogger(__name__) _is_npu = is_npu() - class AttentionArch(IntEnum): MLA = auto() MHA = auto() @@ -597,12 +596,12 @@ def _parse_quant_hf_config(self): return quant_cfg def _find_quant_modelslim_config(self): - quant_config_file = Path(self.model_path, "quant_model_description.json") - quant_cfg = None - if quant_config_file.is_file(): - with open(quant_config_file) as f: + quant_config_file = Path(self.model_path, "quant_model_description.json") + quant_cfg = None + if quant_config_file.is_file(): + with open(quant_config_file) as f: quant_cfg = json.load(f) - + return quant_cfg def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]: @@ -725,7 +724,7 @@ def _verify_quantization(self) -> None: quant_cfg = self._parse_quant_hf_config() if _is_npu: quant_cfg = self._find_quant_modelslim_config() - self.quantization = "modelslim" + self.quantization = 'modelslim' if quant_cfg is not None: quant_method = quant_cfg.get( diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index dfbab790d1ed..54622a1e0873 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -199,7 +199,7 @@ def apply( top_k=topk_ids.shape[1], ) return StandardCombineInput(hidden_states=output) - + @staticmethod def apply_without_routing_weights( layer, @@ -253,11 +253,12 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): @classmethod def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() - # if cls.is_per_channel_weight: + #if cls.is_per_channel_weight: if True: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 - scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu() + scale_uint64_tensor = torch.from_numpy(scale_np.astype( + np.int64)).npu() return scale_uint64_tensor, None per_group_scale = per_group_scale.transpose(1, 2).contiguous() group_num, k, n = weight.shape @@ -340,7 +341,6 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data) staticmethod - def apply( layer, dispatch_output: "StandardDispatchOutput", @@ -351,11 +351,11 @@ def apply( topk_output = dispatch_output.topk_output topk_weights, topk_ids, _ = topk_output - top_k = topk_ids.shape[1] + top_k=topk_ids.shape[1] group_list_type = 1 original_shape = hidden_states.shape topk_weights = topk_weights - + num_tokens = hidden_states.shape[:-1].numel() first_expert_idx = 0 @@ -372,8 +372,7 @@ def apply( expert_tokens_num_flag=True, active_expert_range=[first_expert_idx, last_expert_idx], quant_mode=1, - ) - ) + )) expert_tokens = expert_tokens.to(torch.int64) @@ -383,7 +382,7 @@ def apply( w2_scale = [layer.w2_weight_scale] # TODO w4a8 scene: dynamic acquisition of dtype in the future _output_dtype = torch.bfloat16 - + hidden_states = torch.ops.npu.npu_grouped_matmul( x=[sorted_hidden_states], weight=[layer.w13_weight], @@ -418,8 +417,7 @@ def apply( final_hidden_states = torch.ops.npu.npu_moe_token_unpermute( permuted_tokens=output, sorted_indices=torch.abs(expanded_row_idx), - probs=topk_weights, - ) + probs=topk_weights) if len(original_shape) == 3: final_hidden_states = final_hidden_states.view(original_shape) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 681d45d18f0b..6481b4f79bf4 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -1,8 +1,13 @@ -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, List, Optional import torch from sglang.srt.hardware_backend.npu.utils import npu_format_cast +from sglang.srt.layers.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) from sglang.srt.layers.quantization.base_config import LinearMethodBase if TYPE_CHECKING: @@ -20,7 +25,7 @@ def __init__( class NPUW8A8Int8LinearMethod(_NPULinearMethodBase): - + @staticmethod def apply( layer: torch.nn.Module, @@ -134,4 +139,4 @@ def process_weights_after_loading(layer): layer.weight_offset.data = layer.weight_offset.data.flatten() layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( layer.weight.data.to(torch.int32) - ) + ) \ No newline at end of file diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 4e10c5d734eb..864bc91cc838 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -32,10 +32,10 @@ CompressedTensorsScheme, CompressedTensorsW4A4Fp4, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A16Fp8, - CompressedTensorsWNA16, GPUCompressedTensorsW8A8Int8, NPUCompressedTensorsW8A8Int8, + CompressedTensorsW8A16Fp8, + CompressedTensorsWNA16, ) from sglang.srt.layers.quantization.compressed_tensors.utils import ( find_matched_target, diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 64401aea6a71..fb12922df3be 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -7,10 +7,6 @@ from compressed_tensors.quantization import QuantizationStrategy from torch.nn import Parameter -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( - NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod, -) from sglang.srt.layers.parameter import ( ChannelQuantScaleParameter, ModelWeightParameter, @@ -19,6 +15,10 @@ from sglang.srt.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, ) +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + NPUW8A8Int8DynamicLinearMethod, + NPUW8A8Int8LinearMethod +) from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 from sglang.srt.layers.quantization.utils import requantize_with_max_scale from sglang.srt.utils import is_cuda @@ -94,11 +94,15 @@ def create_weights( class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): - + def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__(strategy, is_static_input_scheme, input_symmetric) + super.__init__( + strategy, + is_static_input_scheme, + input_symmetric + ) @classmethod def get_min_capability(cls) -> int: @@ -186,11 +190,15 @@ def apply_weights( class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): - + def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__(strategy, is_static_input_scheme, input_symmetric) + super.__init__( + strategy, + is_static_input_scheme, + input_symmetric + ) @classmethod def get_min_capability(cls) -> int: @@ -201,9 +209,10 @@ def process_weights_after_loading(self, layer): return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) else: return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) - + def apply_weights(self, layer, x, bias): if self.is_static_input_scheme: return NPUW8A8Int8LinearMethod.apply(layer) else: return NPUW8A8Int8DynamicLinearMethod.apply(layer) + diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 7825b3fd2027..4de4d04ec6b1 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -5,36 +5,42 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast import torch +from compressed_tensors.quantization import QuantizationStrategy +from pydantic import BaseModel # from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( # NPUW4A8Int4DynamicMoEMethod, # NPUW4A16Int4DynamicMoEMethod, # NPUW8A8Int8DynamicMoEMethod, # ) -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod, - _NPULinearMethodBase, +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + _NPULinearMethodBase + # NPUW8A8Int8DynamicLinearMethod, + # NPUW8A8Int8LinearMethod, ) from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) -from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import ( ModelSlimMoEMethod, ) from sglang.srt.layers.quantization.msmodelslim.schemes import ( ModelSlimScheme, - ModelSlimW4A4Int4, ModelSlimW8A8Int8, + ModelSlimW4A4Int4, ) - -# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer +from sglang.srt.layers.quantization.compressed_tensors.utils import ( + find_matched_target, + is_activation_quantization_format, + should_ignore_layer +) +#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod from sglang.srt.utils import apply_module_patch logger = logging.getLogger(__name__) - # func refers to RMSNorm.__init__ def npu_wrapper_rmsnorm_init(func): def init(self, hidden_size: int, **extra_args) -> None: @@ -45,7 +51,6 @@ def init(self, hidden_size: int, **extra_args) -> None: return init - # func refers to RMSNorm.forward_oot def npu_wrapper_rmsnorm_forward(func): def _rmsnorm_forward_oot( @@ -117,7 +122,7 @@ def __init__(self, quant_config: Dict[str, Any] = {}): "forward_npu", [npu_wrapper_rmsnorm_forward], ) - + def get_linear_method(self) -> ModelSlimLinearMethod: return ModelSlimLinearMethod(self) @@ -178,28 +183,29 @@ def get_quant_method( return UnquantizedLinearMethod() scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config) if scheme is None: - raise NotImplementedError( - "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes." - ) + raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.") layer.scheme = scheme - return ModelSlimLinearMethod(self) + return ( + ModelSlimLinearMethod(self) + ) elif isinstance(layer, FusedMoE): return ModelSlimMoEMethod.get_moe_method(self, layer, prefix) return None def _get_scheme_from_parts( - self, - layer_name: str, - ) -> ModelSlimScheme: + self, layer_name: str, + ) -> ModelSlimScheme: - quant_type = self.quant_description[layer_name + ".weight"] + quant_type = self.quant_description[layer_name + '.weight'] if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8": return ModelSlimW8A8Int8( - quant_config=self.quant_description, prefix=layer_name + quant_config=self.quant_description, + prefix=layer_name ) elif quant_type == "W4A4_DYNAMIC": return ModelSlimW4A4Int4( - quant_config=self.quant_description, prefix=layer_name + quant_config=self.quant_description, + prefix=layer_name ) # Detect If Mixed Precision @@ -219,66 +225,66 @@ def _get_scheme_from_parts( # "Other method (CompressedTensorsW4A16Sparse24) is not supported now" # ) - # if is_activation_quantization_format(self.quant_format): - # if self._is_fp8_w8a8(weight_quant, input_quant): - # is_fp8_w8a8_supported = self._check_scheme_supported( - # CompressedTensorsW8A8Fp8.get_min_capability(), error=False - # ) - # if is_fp8_w8a8_supported: - # return CompressedTensorsW8A8Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=( - # input_quant and not input_quant.dynamic - # ), - # ) - # else: - # # note: input_quant will be present for converted models; - # # will be ignored during inference post loading - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=not input_quant.dynamic, - # ) - - # # note: input_quant can be None - # if self._is_fp8_w8a16(weight_quant, input_quant): - # is_static_input_scheme = input_quant and not input_quant.dynamic - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=is_static_input_scheme, - # ) - - # raise NotImplementedError("No msmodelslim compatible scheme was found.") - + #if is_activation_quantization_format(self.quant_format): + # if self._is_fp8_w8a8(weight_quant, input_quant): + # is_fp8_w8a8_supported = self._check_scheme_supported( + # CompressedTensorsW8A8Fp8.get_min_capability(), error=False + # ) + # if is_fp8_w8a8_supported: + # return CompressedTensorsW8A8Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=( + # input_quant and not input_quant.dynamic + # ), + # ) + # else: + # # note: input_quant will be present for converted models; + # # will be ignored during inference post loading + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=not input_quant.dynamic, + # ) + + # # note: input_quant can be None + # if self._is_fp8_w8a16(weight_quant, input_quant): + # is_static_input_scheme = input_quant and not input_quant.dynamic + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=is_static_input_scheme, + # ) + + #raise NotImplementedError("No msmodelslim compatible scheme was found.") + def get_scheme( - self, layer: torch.nn.Module, layer_name: Optional[str] = None - ) -> Optional[ModelSlimScheme]: - """ - get_scheme method adjusted for modelslim, taken from - python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py - """ - # if self.target_scheme_map: - # matched_target = find_matched_target( - # layer_name=layer_name, - # module=layer, - # targets=self.target_scheme_map.keys(), - # fused_mapping=self.packed_modules_mapping, - # ) + self, layer: torch.nn.Module, layer_name: Optional[str] = None + ) -> Optional[ModelSlimScheme]: + """ + get_scheme method adjusted for modelslim, taken from + python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py + """ + # if self.target_scheme_map: + # matched_target = find_matched_target( + # layer_name=layer_name, + # module=layer, + # targets=self.target_scheme_map.keys(), + # fused_mapping=self.packed_modules_mapping, + # ) - # scheme_dict = self.target_scheme_map[matched_target] - # weight_quant = scheme_dict.get("weights") - # input_quant = scheme_dict.get("input_activations") - # else: - # Find the quant_scheme - scheme = self._get_scheme_from_parts( # type: ignore - # weight_quant=weight_quant, - # input_quant=input_quant, - layer_name=layer_name, - ) + # scheme_dict = self.target_scheme_map[matched_target] + # weight_quant = scheme_dict.get("weights") + # input_quant = scheme_dict.get("input_activations") + # else: + # Find the quant_scheme + scheme = self._get_scheme_from_parts( # type: ignore + # weight_quant=weight_quant, + # input_quant=input_quant, + layer_name=layer_name, + ) - # Ascend doesn't support device capability - # self._check_scheme_supported(scheme.get_min_capability()) - logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) - return scheme + # Ascend doesn't support device capability + # self._check_scheme_supported(scheme.get_min_capability()) + logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) + return scheme def is_layer_skipped( self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 4b7b596c4f8d..fe83dc771117 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -2,16 +2,25 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import enum import logging -from typing import TYPE_CHECKING, Any, Dict +from enum import Enum +from typing import Callable, Optional, TYPE_CHECKING +from typing import Any, Dict, List import torch +from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo +from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase +from sglang.srt.layers.quantization.msmodelslim.schemes import ( + ModelSlimScheme, +) +from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( NPUW4A8Int8DynamicMoEMethod, NPUW8A8Int8DynamicMoEMethod, ) -from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase + from sglang.srt.utils import set_weight_attrs if TYPE_CHECKING: @@ -20,7 +29,9 @@ CombineInput, StandardDispatchOutput, ) - from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig + from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ( + ModelSlimConfig, + ) logger = logging.getLogger(__name__) @@ -62,9 +73,7 @@ def get_moe_method( class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod): def __init__( - self, - quant_config: Dict[str, Any], - prefix: str = None, + self, quant_config: Dict[str, Any], prefix: str = None, ): self.quant_config = quant_config self.group_size = 0 @@ -164,9 +173,7 @@ def create_weights( ), requires_grad=False, ) - layer.register_parameter( - "w13_weight_offset_second", w13_weight_offset_second - ) + layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second) set_weight_attrs(w13_weight_offset_second, extra_weight_attrs) w2_weight_scale_second = torch.nn.Parameter( @@ -218,13 +225,14 @@ def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" ): self.moe_runner_config = moe_runner_config - + def apply( self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + def apply_without_routing_weights( self, @@ -235,22 +243,18 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights( - layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype, - ) + return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype,) class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod): def __init__( - self, - quant_config: Dict[str, Any], - prefix: str = None, + self, quant_config: Dict[str, Any], prefix: str = None, ): self.quant_config = quant_config @@ -331,13 +335,14 @@ def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" ): self.moe_runner_config = moe_runner_config - + def apply( self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + def apply_without_routing_weights( self, @@ -348,11 +353,9 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights( - layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype, - ) + return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype,) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py index fba516eed7c0..997892772977 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from .msmodelslim_scheme import ModelSlimScheme -from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4 from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8 +from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4 __all__ = [ "ModelSlimScheme", diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py index 1b578837c8d4..3bbbf4af1f2d 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py @@ -1,28 +1,40 @@ # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Callable, Optional import torch +from torch.nn import Parameter +from typing import Any, Dict, List + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( NPU_W4A4DynamicLinearMethod, ) -from sglang.srt.layers.parameter import PerTensorScaleParameter -from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme +from sglang.srt.layers.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) +from sglang.srt.layers.quantization.msmodelslim.schemes import ( + ModelSlimScheme, +) + from sglang.srt.utils import set_weight_attrs class ModelSlimW4A4Int4(ModelSlimScheme): def __init__( - self, - quant_config: Dict[str, any], - prefix: str, + self, quant_config: Dict[str, any], prefix: str, ): self.quant_config = quant_config - self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC" - + self.is_dynamic = ( + self.quant_config[prefix + ".weight"] + == "W4A4_DYNAMIC" + ) + @staticmethod def get_weight( input_size: int, output_size: int, params_dtype: torch.dtype @@ -52,12 +64,8 @@ def create_weights( ) -> None: output_size_per_partition = sum(output_partition_sizes) weight_loader = extra_weight_attrs.get("weight_loader") - - weight_dict = { - "weight": torch.empty( - output_size_per_partition, input_size_per_partition, dtype=torch.int8 - ) - } + + weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)} for weight_name, weight_param in weight_dict.items(): param = torch.nn.Parameter(weight_param, requires_grad=False) set_weight_attrs(param, {"input_dim": 1, "output_dim": 0}) @@ -72,14 +80,10 @@ def create_weights( # disable warning param.ignore_warning = True layer.register_parameter(pertensor_name, param) - + perchannel_dict = {} - perchannel_dict["weight_scale"] = torch.empty( - output_size_per_partition, 1, dtype=params_dtype - ) - perchannel_dict["weight_offset"] = torch.empty( - output_size_per_partition, 1, dtype=params_dtype - ) + perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) + perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) for perchannel_name, perchannel_param in perchannel_dict.items(): param = torch.nn.Parameter(perchannel_param, requires_grad=False) set_weight_attrs(param, {"output_dim": 0}) @@ -88,7 +92,7 @@ def create_weights( def process_weights_after_loading(self, layer): NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer) - + def apply_weights( self, layer: torch.nn.Module, diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index de99c9fed0b7..b33764b858a9 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -1,31 +1,38 @@ # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional +from typing import Callable, Optional import torch +from torch.nn import Parameter +from typing import Any, Dict, List + +from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod, + NPUW8A8Int8LinearMethod ) from sglang.srt.layers.parameter import ( ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter, ) -from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme +from sglang.srt.layers.quantization.msmodelslim.schemes import ( + ModelSlimScheme, +) class ModelSlimW8A8Int8(ModelSlimScheme): def __init__( - self, - quant_config: Dict[str, any], - prefix: str, + self, quant_config: Dict[str, any], prefix: str, ): self.quant_config = quant_config - self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC" + self.is_dynamic = ( + self.quant_config[prefix + ".weight"] + == "W8A8_DYNAMIC" + ) def create_weights( self, @@ -63,7 +70,7 @@ def create_weights( weight_loader=weight_loader, ) layer.register_parameter("weight_offset", weight_offset) - + if not self.is_dynamic: input_scale = PerTensorScaleParameter( data=torch.empty(1, dtype=params_dtype), @@ -104,7 +111,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module): NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) else: NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) - + def apply_weights( self, layer: torch.nn.Module, @@ -114,4 +121,4 @@ def apply_weights( if self.is_dynamic: return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias) else: - return NPUW8A8Int8LinearMethod.apply(layer, x, bias) + return NPUW8A8Int8LinearMethod.apply(layer, x, bias) \ No newline at end of file diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py index 959bf88a513f..5c1bc3b66562 100644 --- a/test/manual/ascend/test_ascend_w8a8_quantization.py +++ b/test/manual/ascend/test_ascend_w8a8_quantization.py @@ -98,12 +98,12 @@ def test_throughput(self): if is_in_ci(): self.assertGreaterEqual(throughput, 25) - + class TestAscendW8A8CompressedTensors(CustomTestCase): @classmethod def setUpClass(cls): - # TODO: Move model to CI or Modelscope + #TODO: Move model to CI or Modelscope cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( From df01a40b54829751c2e71a619989f28dd8b05fa2 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Wed, 17 Dec 2025 13:30:34 +0300 Subject: [PATCH 036/175] Pre-commit fixes --- python/sglang/srt/configs/model_config.py | 17 +- .../npu/quantization/fused_moe_method_npu.py | 20 ++- .../npu/quantization/linear_method_npu.py | 11 +- .../compressed_tensors/compressed_tensors.py | 4 +- .../schemes/compressed_tensors_w8a8_int8.py | 27 +-- .../quantization/msmodelslim/msmodelslim.py | 158 +++++++++--------- .../msmodelslim/msmodelslim_moe.py | 63 ++++--- .../msmodelslim/schemes/__init__.py | 2 +- .../schemes/msmodelslim_w4a4_int4.py | 48 +++--- .../schemes/msmodelslim_w8a8_int8.py | 27 ++- .../ascend/test_ascend_w8a8_quantization.py | 4 +- 11 files changed, 175 insertions(+), 206 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index a4253b9f59ce..8e7e98a77d0b 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -16,8 +16,8 @@ import logging import math import os -from pathlib import Path from enum import Enum, IntEnum, auto +from pathlib import Path from typing import Any, List, Optional, Set, Union import torch @@ -26,7 +26,7 @@ from sglang.srt.environ import envs from sglang.srt.layers.quantization import QUANTIZATION_METHODS from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import is_hip, retry, is_npu +from sglang.srt.utils import is_hip, is_npu, retry from sglang.srt.utils.hf_transformers_utils import ( get_config, get_context_length, @@ -39,6 +39,7 @@ logger = logging.getLogger(__name__) _is_npu = is_npu() + class AttentionArch(IntEnum): MLA = auto() MHA = auto() @@ -596,12 +597,12 @@ def _parse_quant_hf_config(self): return quant_cfg def _find_quant_modelslim_config(self): - quant_config_file = Path(self.model_path, "quant_model_description.json") - quant_cfg = None - if quant_config_file.is_file(): - with open(quant_config_file) as f: + quant_config_file = Path(self.model_path, "quant_model_description.json") + quant_cfg = None + if quant_config_file.is_file(): + with open(quant_config_file) as f: quant_cfg = json.load(f) - + return quant_cfg def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]: @@ -724,7 +725,7 @@ def _verify_quantization(self) -> None: quant_cfg = self._parse_quant_hf_config() if _is_npu: quant_cfg = self._find_quant_modelslim_config() - self.quantization = 'modelslim' + self.quantization = "modelslim" if quant_cfg is not None: quant_method = quant_cfg.get( diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 54622a1e0873..dfbab790d1ed 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -199,7 +199,7 @@ def apply( top_k=topk_ids.shape[1], ) return StandardCombineInput(hidden_states=output) - + @staticmethod def apply_without_routing_weights( layer, @@ -253,12 +253,11 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): @classmethod def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() - #if cls.is_per_channel_weight: + # if cls.is_per_channel_weight: if True: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 - scale_uint64_tensor = torch.from_numpy(scale_np.astype( - np.int64)).npu() + scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu() return scale_uint64_tensor, None per_group_scale = per_group_scale.transpose(1, 2).contiguous() group_num, k, n = weight.shape @@ -341,6 +340,7 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data) staticmethod + def apply( layer, dispatch_output: "StandardDispatchOutput", @@ -351,11 +351,11 @@ def apply( topk_output = dispatch_output.topk_output topk_weights, topk_ids, _ = topk_output - top_k=topk_ids.shape[1] + top_k = topk_ids.shape[1] group_list_type = 1 original_shape = hidden_states.shape topk_weights = topk_weights - + num_tokens = hidden_states.shape[:-1].numel() first_expert_idx = 0 @@ -372,7 +372,8 @@ def apply( expert_tokens_num_flag=True, active_expert_range=[first_expert_idx, last_expert_idx], quant_mode=1, - )) + ) + ) expert_tokens = expert_tokens.to(torch.int64) @@ -382,7 +383,7 @@ def apply( w2_scale = [layer.w2_weight_scale] # TODO w4a8 scene: dynamic acquisition of dtype in the future _output_dtype = torch.bfloat16 - + hidden_states = torch.ops.npu.npu_grouped_matmul( x=[sorted_hidden_states], weight=[layer.w13_weight], @@ -417,7 +418,8 @@ def apply( final_hidden_states = torch.ops.npu.npu_moe_token_unpermute( permuted_tokens=output, sorted_indices=torch.abs(expanded_row_idx), - probs=topk_weights) + probs=topk_weights, + ) if len(original_shape) == 3: final_hidden_states = final_hidden_states.view(original_shape) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 6481b4f79bf4..681d45d18f0b 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -1,13 +1,8 @@ -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Optional import torch from sglang.srt.hardware_backend.npu.utils import npu_format_cast -from sglang.srt.layers.parameter import ( - ChannelQuantScaleParameter, - ModelWeightParameter, - PerTensorScaleParameter, -) from sglang.srt.layers.quantization.base_config import LinearMethodBase if TYPE_CHECKING: @@ -25,7 +20,7 @@ def __init__( class NPUW8A8Int8LinearMethod(_NPULinearMethodBase): - + @staticmethod def apply( layer: torch.nn.Module, @@ -139,4 +134,4 @@ def process_weights_after_loading(layer): layer.weight_offset.data = layer.weight_offset.data.flatten() layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( layer.weight.data.to(torch.int32) - ) \ No newline at end of file + ) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 864bc91cc838..4e10c5d734eb 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -32,10 +32,10 @@ CompressedTensorsScheme, CompressedTensorsW4A4Fp4, CompressedTensorsW8A8Fp8, - GPUCompressedTensorsW8A8Int8, - NPUCompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsWNA16, + GPUCompressedTensorsW8A8Int8, + NPUCompressedTensorsW8A8Int8, ) from sglang.srt.layers.quantization.compressed_tensors.utils import ( find_matched_target, diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index fb12922df3be..64401aea6a71 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -7,6 +7,10 @@ from compressed_tensors.quantization import QuantizationStrategy from torch.nn import Parameter +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( + NPUW8A8Int8DynamicLinearMethod, + NPUW8A8Int8LinearMethod, +) from sglang.srt.layers.parameter import ( ChannelQuantScaleParameter, ModelWeightParameter, @@ -15,10 +19,6 @@ from sglang.srt.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, ) -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( - NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod -) from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 from sglang.srt.layers.quantization.utils import requantize_with_max_scale from sglang.srt.utils import is_cuda @@ -94,15 +94,11 @@ def create_weights( class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): - + def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__( - strategy, - is_static_input_scheme, - input_symmetric - ) + super.__init__(strategy, is_static_input_scheme, input_symmetric) @classmethod def get_min_capability(cls) -> int: @@ -190,15 +186,11 @@ def apply_weights( class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): - + def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__( - strategy, - is_static_input_scheme, - input_symmetric - ) + super.__init__(strategy, is_static_input_scheme, input_symmetric) @classmethod def get_min_capability(cls) -> int: @@ -209,10 +201,9 @@ def process_weights_after_loading(self, layer): return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) else: return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) - + def apply_weights(self, layer, x, bias): if self.is_static_input_scheme: return NPUW8A8Int8LinearMethod.apply(layer) else: return NPUW8A8Int8DynamicLinearMethod.apply(layer) - diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 4de4d04ec6b1..7825b3fd2027 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -5,42 +5,36 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast import torch -from compressed_tensors.quantization import QuantizationStrategy -from pydantic import BaseModel # from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( # NPUW4A8Int4DynamicMoEMethod, # NPUW4A16Int4DynamicMoEMethod, # NPUW8A8Int8DynamicMoEMethod, # ) -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( - _NPULinearMethodBase - # NPUW8A8Int8DynamicLinearMethod, - # NPUW8A8Int8LinearMethod, +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod, + _NPULinearMethodBase, ) from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) +from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import ( ModelSlimMoEMethod, ) from sglang.srt.layers.quantization.msmodelslim.schemes import ( ModelSlimScheme, - ModelSlimW8A8Int8, ModelSlimW4A4Int4, + ModelSlimW8A8Int8, ) -from sglang.srt.layers.quantization.compressed_tensors.utils import ( - find_matched_target, - is_activation_quantization_format, - should_ignore_layer -) -#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer + +# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod from sglang.srt.utils import apply_module_patch logger = logging.getLogger(__name__) + # func refers to RMSNorm.__init__ def npu_wrapper_rmsnorm_init(func): def init(self, hidden_size: int, **extra_args) -> None: @@ -51,6 +45,7 @@ def init(self, hidden_size: int, **extra_args) -> None: return init + # func refers to RMSNorm.forward_oot def npu_wrapper_rmsnorm_forward(func): def _rmsnorm_forward_oot( @@ -122,7 +117,7 @@ def __init__(self, quant_config: Dict[str, Any] = {}): "forward_npu", [npu_wrapper_rmsnorm_forward], ) - + def get_linear_method(self) -> ModelSlimLinearMethod: return ModelSlimLinearMethod(self) @@ -183,29 +178,28 @@ def get_quant_method( return UnquantizedLinearMethod() scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config) if scheme is None: - raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.") + raise NotImplementedError( + "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes." + ) layer.scheme = scheme - return ( - ModelSlimLinearMethod(self) - ) + return ModelSlimLinearMethod(self) elif isinstance(layer, FusedMoE): return ModelSlimMoEMethod.get_moe_method(self, layer, prefix) return None def _get_scheme_from_parts( - self, layer_name: str, - ) -> ModelSlimScheme: + self, + layer_name: str, + ) -> ModelSlimScheme: - quant_type = self.quant_description[layer_name + '.weight'] + quant_type = self.quant_description[layer_name + ".weight"] if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8": return ModelSlimW8A8Int8( - quant_config=self.quant_description, - prefix=layer_name + quant_config=self.quant_description, prefix=layer_name ) elif quant_type == "W4A4_DYNAMIC": return ModelSlimW4A4Int4( - quant_config=self.quant_description, - prefix=layer_name + quant_config=self.quant_description, prefix=layer_name ) # Detect If Mixed Precision @@ -225,66 +219,66 @@ def _get_scheme_from_parts( # "Other method (CompressedTensorsW4A16Sparse24) is not supported now" # ) - #if is_activation_quantization_format(self.quant_format): - # if self._is_fp8_w8a8(weight_quant, input_quant): - # is_fp8_w8a8_supported = self._check_scheme_supported( - # CompressedTensorsW8A8Fp8.get_min_capability(), error=False - # ) - # if is_fp8_w8a8_supported: - # return CompressedTensorsW8A8Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=( - # input_quant and not input_quant.dynamic - # ), - # ) - # else: - # # note: input_quant will be present for converted models; - # # will be ignored during inference post loading - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=not input_quant.dynamic, - # ) - - # # note: input_quant can be None - # if self._is_fp8_w8a16(weight_quant, input_quant): - # is_static_input_scheme = input_quant and not input_quant.dynamic - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=is_static_input_scheme, - # ) - - #raise NotImplementedError("No msmodelslim compatible scheme was found.") - - def get_scheme( - self, layer: torch.nn.Module, layer_name: Optional[str] = None - ) -> Optional[ModelSlimScheme]: - """ - get_scheme method adjusted for modelslim, taken from - python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py - """ - # if self.target_scheme_map: - # matched_target = find_matched_target( - # layer_name=layer_name, - # module=layer, - # targets=self.target_scheme_map.keys(), - # fused_mapping=self.packed_modules_mapping, + # if is_activation_quantization_format(self.quant_format): + # if self._is_fp8_w8a8(weight_quant, input_quant): + # is_fp8_w8a8_supported = self._check_scheme_supported( + # CompressedTensorsW8A8Fp8.get_min_capability(), error=False # ) + # if is_fp8_w8a8_supported: + # return CompressedTensorsW8A8Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=( + # input_quant and not input_quant.dynamic + # ), + # ) + # else: + # # note: input_quant will be present for converted models; + # # will be ignored during inference post loading + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=not input_quant.dynamic, + # ) - # scheme_dict = self.target_scheme_map[matched_target] - # weight_quant = scheme_dict.get("weights") - # input_quant = scheme_dict.get("input_activations") - # else: - # Find the quant_scheme - scheme = self._get_scheme_from_parts( # type: ignore - # weight_quant=weight_quant, - # input_quant=input_quant, - layer_name=layer_name, - ) + # # note: input_quant can be None + # if self._is_fp8_w8a16(weight_quant, input_quant): + # is_static_input_scheme = input_quant and not input_quant.dynamic + # return CompressedTensorsW8A16Fp8( + # strategy=weight_quant.strategy, + # is_static_input_scheme=is_static_input_scheme, + # ) + + # raise NotImplementedError("No msmodelslim compatible scheme was found.") + + def get_scheme( + self, layer: torch.nn.Module, layer_name: Optional[str] = None + ) -> Optional[ModelSlimScheme]: + """ + get_scheme method adjusted for modelslim, taken from + python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py + """ + # if self.target_scheme_map: + # matched_target = find_matched_target( + # layer_name=layer_name, + # module=layer, + # targets=self.target_scheme_map.keys(), + # fused_mapping=self.packed_modules_mapping, + # ) + + # scheme_dict = self.target_scheme_map[matched_target] + # weight_quant = scheme_dict.get("weights") + # input_quant = scheme_dict.get("input_activations") + # else: + # Find the quant_scheme + scheme = self._get_scheme_from_parts( # type: ignore + # weight_quant=weight_quant, + # input_quant=input_quant, + layer_name=layer_name, + ) - # Ascend doesn't support device capability - # self._check_scheme_supported(scheme.get_min_capability()) - logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) - return scheme + # Ascend doesn't support device capability + # self._check_scheme_supported(scheme.get_min_capability()) + logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) + return scheme def is_layer_skipped( self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index fe83dc771117..4b7b596c4f8d 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -2,25 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import enum import logging -from enum import Enum -from typing import Callable, Optional, TYPE_CHECKING -from typing import Any, Dict, List +from typing import TYPE_CHECKING, Any, Dict import torch -from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo -from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase -from sglang.srt.layers.quantization.msmodelslim.schemes import ( - ModelSlimScheme, -) -from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( NPUW4A8Int8DynamicMoEMethod, NPUW8A8Int8DynamicMoEMethod, ) - +from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase from sglang.srt.utils import set_weight_attrs if TYPE_CHECKING: @@ -29,9 +20,7 @@ CombineInput, StandardDispatchOutput, ) - from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ( - ModelSlimConfig, - ) + from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig logger = logging.getLogger(__name__) @@ -73,7 +62,9 @@ def get_moe_method( class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod): def __init__( - self, quant_config: Dict[str, Any], prefix: str = None, + self, + quant_config: Dict[str, Any], + prefix: str = None, ): self.quant_config = quant_config self.group_size = 0 @@ -173,7 +164,9 @@ def create_weights( ), requires_grad=False, ) - layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second) + layer.register_parameter( + "w13_weight_offset_second", w13_weight_offset_second + ) set_weight_attrs(w13_weight_offset_second, extra_weight_attrs) w2_weight_scale_second = torch.nn.Parameter( @@ -225,14 +218,13 @@ def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" ): self.moe_runner_config = moe_runner_config - + def apply( self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output) - def apply_without_routing_weights( self, @@ -243,18 +235,22 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype,) + return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights( + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ) class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod): def __init__( - self, quant_config: Dict[str, Any], prefix: str = None, + self, + quant_config: Dict[str, Any], + prefix: str = None, ): self.quant_config = quant_config @@ -335,14 +331,13 @@ def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" ): self.moe_runner_config = moe_runner_config - + def apply( self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output) - def apply_without_routing_weights( self, @@ -353,9 +348,11 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer, - hidden_states, - hidden_states_scale, - group_list_type, - group_list, - output_dtype,) + return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights( + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py index 997892772977..fba516eed7c0 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from .msmodelslim_scheme import ModelSlimScheme -from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8 from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4 +from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8 __all__ = [ "ModelSlimScheme", diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py index 3bbbf4af1f2d..1b578837c8d4 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py @@ -1,40 +1,28 @@ # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Any, Dict, List, Optional import torch -from torch.nn import Parameter -from typing import Any, Dict, List - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( NPU_W4A4DynamicLinearMethod, ) -from sglang.srt.layers.parameter import ( - ChannelQuantScaleParameter, - ModelWeightParameter, - PerTensorScaleParameter, -) -from sglang.srt.layers.quantization.msmodelslim.schemes import ( - ModelSlimScheme, -) - +from sglang.srt.layers.parameter import PerTensorScaleParameter +from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme from sglang.srt.utils import set_weight_attrs class ModelSlimW4A4Int4(ModelSlimScheme): def __init__( - self, quant_config: Dict[str, any], prefix: str, + self, + quant_config: Dict[str, any], + prefix: str, ): self.quant_config = quant_config - self.is_dynamic = ( - self.quant_config[prefix + ".weight"] - == "W4A4_DYNAMIC" - ) - + self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC" + @staticmethod def get_weight( input_size: int, output_size: int, params_dtype: torch.dtype @@ -64,8 +52,12 @@ def create_weights( ) -> None: output_size_per_partition = sum(output_partition_sizes) weight_loader = extra_weight_attrs.get("weight_loader") - - weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)} + + weight_dict = { + "weight": torch.empty( + output_size_per_partition, input_size_per_partition, dtype=torch.int8 + ) + } for weight_name, weight_param in weight_dict.items(): param = torch.nn.Parameter(weight_param, requires_grad=False) set_weight_attrs(param, {"input_dim": 1, "output_dim": 0}) @@ -80,10 +72,14 @@ def create_weights( # disable warning param.ignore_warning = True layer.register_parameter(pertensor_name, param) - + perchannel_dict = {} - perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) - perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype) + perchannel_dict["weight_scale"] = torch.empty( + output_size_per_partition, 1, dtype=params_dtype + ) + perchannel_dict["weight_offset"] = torch.empty( + output_size_per_partition, 1, dtype=params_dtype + ) for perchannel_name, perchannel_param in perchannel_dict.items(): param = torch.nn.Parameter(perchannel_param, requires_grad=False) set_weight_attrs(param, {"output_dim": 0}) @@ -92,7 +88,7 @@ def create_weights( def process_weights_after_loading(self, layer): NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer) - + def apply_weights( self, layer: torch.nn.Module, diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index b33764b858a9..de99c9fed0b7 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -1,38 +1,31 @@ # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Dict, List, Optional import torch -from torch.nn import Parameter -from typing import Any, Dict, List - -from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod + NPUW8A8Int8LinearMethod, ) from sglang.srt.layers.parameter import ( ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter, ) -from sglang.srt.layers.quantization.msmodelslim.schemes import ( - ModelSlimScheme, -) +from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme class ModelSlimW8A8Int8(ModelSlimScheme): def __init__( - self, quant_config: Dict[str, any], prefix: str, + self, + quant_config: Dict[str, any], + prefix: str, ): self.quant_config = quant_config - self.is_dynamic = ( - self.quant_config[prefix + ".weight"] - == "W8A8_DYNAMIC" - ) + self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC" def create_weights( self, @@ -70,7 +63,7 @@ def create_weights( weight_loader=weight_loader, ) layer.register_parameter("weight_offset", weight_offset) - + if not self.is_dynamic: input_scale = PerTensorScaleParameter( data=torch.empty(1, dtype=params_dtype), @@ -111,7 +104,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module): NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) else: NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) - + def apply_weights( self, layer: torch.nn.Module, @@ -121,4 +114,4 @@ def apply_weights( if self.is_dynamic: return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias) else: - return NPUW8A8Int8LinearMethod.apply(layer, x, bias) \ No newline at end of file + return NPUW8A8Int8LinearMethod.apply(layer, x, bias) diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py index 5c1bc3b66562..959bf88a513f 100644 --- a/test/manual/ascend/test_ascend_w8a8_quantization.py +++ b/test/manual/ascend/test_ascend_w8a8_quantization.py @@ -98,12 +98,12 @@ def test_throughput(self): if is_in_ci(): self.assertGreaterEqual(throughput, 25) - + class TestAscendW8A8CompressedTensors(CustomTestCase): @classmethod def setUpClass(cls): - #TODO: Move model to CI or Modelscope + # TODO: Move model to CI or Modelscope cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( From a16b69e21dacbae5d371a14e1ec144ce23d85e03 Mon Sep 17 00:00:00 2001 From: TamirBaydasov Date: Wed, 17 Dec 2025 14:35:42 +0300 Subject: [PATCH 037/175] Fix model config loading, add NPU w8a8int8 MoE for compressed-tensors, fix for w8a8int8 linear schemes --- python/sglang/srt/configs/model_config.py | 19 ++- .../npu/quantization/fused_moe_method_npu.py | 15 ++- .../npu/quantization/linear_method_npu.py | 8 +- .../compressed_tensors/compressed_tensors.py | 4 +- .../compressed_tensors_moe.py | 127 +++++++++++++++++- .../schemes/compressed_tensors_w8a8_int8.py | 24 ++-- 6 files changed, 170 insertions(+), 27 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 8e7e98a77d0b..1d1f81c87aa8 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -602,6 +602,9 @@ def _find_quant_modelslim_config(self): if quant_config_file.is_file(): with open(quant_config_file) as f: quant_cfg = json.load(f) + # This field is required for flagless model loading but is not present in + # modelslim model description, so we're adding it here manually. + quant_cfg['quant_method'] = 'modelslim' return quant_cfg @@ -721,11 +724,17 @@ def _verify_quantization(self) -> None: if self.quantization is not None: self.quantization = self.quantization.lower() - # Parse quantization method from the HF model config, if available. - quant_cfg = self._parse_quant_hf_config() - if _is_npu: - quant_cfg = self._find_quant_modelslim_config() - self.quantization = "modelslim" + # Parse quantization method from the HF and ModelSlim model config, if available. + # Only one function should return config, other should return None. + cfg_list = [] + cfg_list.append(self._parse_quant_hf_config) + cfg_list.append(self._find_quant_modelslim_config) + + # Filter out None values + cfg_list = [item for item in cfg_list if item is not None] + assert (len(cfg_list) == 1), "Config list contains configs from 2 methods, must be only 1" + + quant_cfg = cfg_list[0] if quant_cfg is not None: quant_method = quant_cfg.get( diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index dfbab790d1ed..db00f47f90d5 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -165,12 +165,15 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w2_weight_scale = torch.nn.Parameter( layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False ) - layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) - layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False - ) + # Compressed-tensors format doesn't have this field + if hasattr(layer, "w13_weight_offset"): + layer.w13_weight_offset = torch.nn.Parameter( + layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) + if hasattr(layer, "w2_weight_offset"): + layer.w2_weight_offset = torch.nn.Parameter( + layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + ) layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 681d45d18f0b..2d70834caf0b 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -59,7 +59,9 @@ def process_weights_after_loading(layer: torch.nn.Module): layer.weight.data = npu_format_cast(layer.weight.data) layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_offset.data = layer.weight_offset.data.flatten() + # Compressed-tensors format doesn't have this field + if hasattr(layer, "weight_offset"): + layer.weight_offset.data = layer.weight_offset.data.flatten() expanding_factor = layer.weight.data.shape[0] layer.aclnn_input_scale = torch.nn.Parameter( @@ -101,7 +103,9 @@ def process_weights_after_loading(layer: torch.nn.Module): layer.weight.data = npu_format_cast(layer.weight.data) layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_offset.data = layer.weight_offset.data.flatten() + # Compressed-tensors format doesn't have this field + if hasattr(layer, "weight_offset"): + layer.weight_offset.data = layer.weight_offset.data.flatten() class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase): diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 4e10c5d734eb..f97854839b2c 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -588,7 +588,9 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. fp8 needs ada lovelace) - self._check_scheme_supported(scheme.get_min_capability()) + # Note: NPU devices do not support min_capability function + if not _is_npu: + self._check_scheme_supported(scheme.get_min_capability()) logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) return scheme diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index b5e3964c85f4..3ec4a45f43f9 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -15,6 +15,9 @@ from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo +from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( + NPUW8A8Int8DynamicMoEMethod, +) from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase from sglang.srt.layers.quantization.compressed_tensors.schemes import ( WNA16_SUPPORTED_BITS, @@ -32,7 +35,7 @@ replace_parameter, swizzle_blockscale, ) -from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip, set_weight_attrs +from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, is_hip, set_weight_attrs if TYPE_CHECKING: from sglang.srt.layers.moe.fused_moe_triton import FusedMoE @@ -45,6 +48,7 @@ ) _is_hip = is_hip() +_is_npu = is_npu() _is_cuda = is_cuda() _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip @@ -67,6 +71,7 @@ class GPTQMarlinState(Enum): "CompressedTensorsMoEMethod", "CompressedTensorsW4A4Nvfp4MoEMethod", "CompressedTensorsW8A8Fp8MoEMethod", + "NPUCompressedTensorsW8A8Int8MoEMethod" "CompressedTensorsWNA16MoEMethod", ] @@ -98,6 +103,12 @@ def get_moe_method( elif quant_config._is_fp8_w8a8(weight_quant, input_quant): logger.info_once("Using CompressedTensorsW8A8Fp8MoEMethod") return CompressedTensorsW8A8Fp8MoEMethod(quant_config) + elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant): + if _is_npu: + logger.info_once("Using NPUCompressedTensorsW8A8Int8MoEMethod") + return NPUCompressedTensorsW8A8Int8MoEMethod(quant_config) + else: + raise NotImplementedError(f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now.") else: raise RuntimeError( f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}" @@ -681,6 +692,120 @@ def apply( return self.runner.run(dispatch_output, quant_info) +class NPUCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): + + def __init__(self, quant_config: CompressedTensorsConfig): + self.quant_config = quant_config + self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights") + self.input_quant = self.quant_config.target_scheme_map["Linear"].get( + "input_activations" + ) + if not _is_npu: + raise NotImplementedError( + "w8a8 int8 compressed tensors moe scheme is supported only for Ascend device for now." + ) + self.static_input_scales = not self.input_quant.dynamic + per_channel = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL + and self.input_quant.strategy == QuantizationStrategy.TOKEN + ) + if not per_channel: + raise ValueError( + "For INT8 Fused MoE layers, we require channelwise, " + "dynamic per token quantization. Found " + f"{self.weight_quant}, {self.input_quant}" + ) + + self.static_input_scales = not self.input_quant.dynamic + if self.static_input_scales: + raise ValueError( + "For INT8 Fused MoE layers, we require channelwise, " + "dynamic per token quantization. Found static input scales." + ) + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + + from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported + + params_dtype = torch.int8 + + # WEIGHTS + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + assert not self.static_input_scales + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig + ): + self.moe_runner_config = moe_runner_config + self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config) + + def apply( + self, + layer: torch.nn.Module, + dispatch_output: StandardDispatchOutput, + ) -> CombineInput: + + return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + + class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): def __init__(self, quant_config: CompressedTensorsConfig, num_gpu_experts=-1): diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 64401aea6a71..73c24aec92e5 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -93,12 +93,12 @@ def create_weights( layer.register_parameter("input_zero_point", input_zero_point) -class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): +class GPUCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8): def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__(strategy, is_static_input_scheme, input_symmetric) + super().__init__(strategy, is_static_input_scheme, input_symmetric) @classmethod def get_min_capability(cls) -> int: @@ -185,25 +185,25 @@ def apply_weights( ) -class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme): +class NPUCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8): def __init__( self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool ): - super.__init__(strategy, is_static_input_scheme, input_symmetric) + super().__init__(strategy, is_static_input_scheme, input_symmetric) + # TODO: Currently, NPU kernel for static quant requires quant_bias field, + # which can't be replicated in compressed-tensors. + if self.is_static_input_scheme: + raise NotImplementedError( + "Static compressed-tensors scheme is not yet supported on NPU." + ) @classmethod def get_min_capability(cls) -> int: return NotImplementedError def process_weights_after_loading(self, layer): - if self.is_static_input_scheme: - return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) - else: - return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) + return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) def apply_weights(self, layer, x, bias): - if self.is_static_input_scheme: - return NPUW8A8Int8LinearMethod.apply(layer) - else: - return NPUW8A8Int8DynamicLinearMethod.apply(layer) + return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias) From 238759c5fa873af55af3bd370a69fc8cd94f0a8d Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Wed, 17 Dec 2025 14:45:40 +0300 Subject: [PATCH 038/175] Pre-commit fixes --- python/sglang/srt/configs/model_config.py | 8 +++++--- .../npu/quantization/fused_moe_method_npu.py | 6 ++++-- .../compressed_tensors/compressed_tensors_moe.py | 15 ++++++++------- .../schemes/compressed_tensors_w8a8_int8.py | 1 - 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 1d1f81c87aa8..5e5eba62a295 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -604,7 +604,7 @@ def _find_quant_modelslim_config(self): quant_cfg = json.load(f) # This field is required for flagless model loading but is not present in # modelslim model description, so we're adding it here manually. - quant_cfg['quant_method'] = 'modelslim' + quant_cfg["quant_method"] = "modelslim" return quant_cfg @@ -732,8 +732,10 @@ def _verify_quantization(self) -> None: # Filter out None values cfg_list = [item for item in cfg_list if item is not None] - assert (len(cfg_list) == 1), "Config list contains configs from 2 methods, must be only 1" - + assert ( + len(cfg_list) == 1 + ), "Config list contains configs from 2 methods, must be only 1" + quant_cfg = cfg_list[0] if quant_cfg is not None: diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index db00f47f90d5..71ab140c1f02 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -168,11 +168,13 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: # Compressed-tensors format doesn't have this field if hasattr(layer, "w13_weight_offset"): layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + layer.w13_weight_offset.data.squeeze(-1).contiguous(), + requires_grad=False, ) if hasattr(layer, "w2_weight_offset"): layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + layer.w2_weight_offset.data.squeeze(-1).contiguous(), + requires_grad=False, ) layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 3ec4a45f43f9..9e73f5ac10ef 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -12,12 +12,12 @@ from compressed_tensors.quantization import QuantizationStrategy from sglang.srt.distributed import get_tensor_model_parallel_world_size -from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig -from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType -from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( NPUW8A8Int8DynamicMoEMethod, ) +from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig +from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType +from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase from sglang.srt.layers.quantization.compressed_tensors.schemes import ( WNA16_SUPPORTED_BITS, @@ -35,7 +35,7 @@ replace_parameter, swizzle_blockscale, ) -from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, is_hip, set_weight_attrs +from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip, is_npu, set_weight_attrs if TYPE_CHECKING: from sglang.srt.layers.moe.fused_moe_triton import FusedMoE @@ -71,8 +71,7 @@ class GPTQMarlinState(Enum): "CompressedTensorsMoEMethod", "CompressedTensorsW4A4Nvfp4MoEMethod", "CompressedTensorsW8A8Fp8MoEMethod", - "NPUCompressedTensorsW8A8Int8MoEMethod" - "CompressedTensorsWNA16MoEMethod", + "NPUCompressedTensorsW8A8Int8MoEMethod" "CompressedTensorsWNA16MoEMethod", ] @@ -108,7 +107,9 @@ def get_moe_method( logger.info_once("Using NPUCompressedTensorsW8A8Int8MoEMethod") return NPUCompressedTensorsW8A8Int8MoEMethod(quant_config) else: - raise NotImplementedError(f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now.") + raise NotImplementedError( + f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now." + ) else: raise RuntimeError( f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}" diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 73c24aec92e5..d307f6b01c33 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -9,7 +9,6 @@ from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( NPUW8A8Int8DynamicLinearMethod, - NPUW8A8Int8LinearMethod, ) from sglang.srt.layers.parameter import ( ChannelQuantScaleParameter, From 5ca19cb80e56f50861903694674cf68e399141cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:20:54 +0300 Subject: [PATCH 039/175] Delete comments --- .../quantization/msmodelslim/msmodelslim.py | 115 ------------------ 1 file changed, 115 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 7825b3fd2027..111fda21c02e 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -202,52 +202,6 @@ def _get_scheme_from_parts( quant_config=self.quant_description, prefix=layer_name ) - # Detect If Mixed Precision - # if self._is_wNa16_group_channel(weight_quant, input_quant): - # if ( - # self.quant_format == CompressionFormat.pack_quantized.value - # and weight_quant.num_bits in WNA16_SUPPORTED_BITS - # ): - # return CompressedTensorsWNA16( - # num_bits=weight_quant.num_bits, - # strategy=weight_quant.strategy, - # group_size=weight_quant.group_size, - # actorder=weight_quant.actorder, - # ) - # else: - # raise ImportError( - # "Other method (CompressedTensorsW4A16Sparse24) is not supported now" - # ) - - # if is_activation_quantization_format(self.quant_format): - # if self._is_fp8_w8a8(weight_quant, input_quant): - # is_fp8_w8a8_supported = self._check_scheme_supported( - # CompressedTensorsW8A8Fp8.get_min_capability(), error=False - # ) - # if is_fp8_w8a8_supported: - # return CompressedTensorsW8A8Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=( - # input_quant and not input_quant.dynamic - # ), - # ) - # else: - # # note: input_quant will be present for converted models; - # # will be ignored during inference post loading - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=not input_quant.dynamic, - # ) - - # # note: input_quant can be None - # if self._is_fp8_w8a16(weight_quant, input_quant): - # is_static_input_scheme = input_quant and not input_quant.dynamic - # return CompressedTensorsW8A16Fp8( - # strategy=weight_quant.strategy, - # is_static_input_scheme=is_static_input_scheme, - # ) - - # raise NotImplementedError("No msmodelslim compatible scheme was found.") def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None @@ -256,19 +210,6 @@ def get_scheme( get_scheme method adjusted for modelslim, taken from python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py """ - # if self.target_scheme_map: - # matched_target = find_matched_target( - # layer_name=layer_name, - # module=layer, - # targets=self.target_scheme_map.keys(), - # fused_mapping=self.packed_modules_mapping, - # ) - - # scheme_dict = self.target_scheme_map[matched_target] - # weight_quant = scheme_dict.get("weights") - # input_quant = scheme_dict.get("input_activations") - # else: - # Find the quant_scheme scheme = self._get_scheme_from_parts( # type: ignore # weight_quant=weight_quant, # input_quant=input_quant, @@ -314,62 +255,6 @@ def is_layer_skipped( def get_scaled_act_names(self) -> List[str]: return [] - # def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool: - # is_w4 = weight_quant.num_bits == 4 - # weight_strategy = ( - # weight_quant.strategy == QuantizationStrategy.TENSOR.value - # or weight_quant.strategy == QuantizationStrategy.CHANNEL.value - # or weight_quant.strategy == QuantizationStrategy.GROUP.value - # ) - # if input_quant is not None: - # is_token = ( - # weight_strategy - # and input_quant.strategy == QuantizationStrategy.TOKEN.value - # ) - # is_dynamic = not weight_quant.dynamic and input_quant.dynamic - # else: - # is_token = weight_strategy - # is_dynamic = not weight_quant.dynamic - - # # Both symmetric and asymmetric input quantization supported. - # # Only symmetric weight quantization supported. - # return is_w4 and weight_quant.symmetric and is_token and is_dynamic - - # def _is_static_tensor_w8a8( - # self, weight_quant: BaseModel, input_quant: BaseModel - # ) -> bool: - # is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - # weight_strategy = ( - # weight_quant.strategy == QuantizationStrategy.TENSOR.value - # or weight_quant.strategy == QuantizationStrategy.CHANNEL.value - # ) - # is_tensor = ( - # weight_strategy - # and input_quant.strategy == QuantizationStrategy.TENSOR.value - # ) - # is_static = not weight_quant.dynamic and not input_quant.dynamic - - # # Both symmetric and asymmetric input quantization supported. - # # Only symmetric weight quantization supported. - # return is_8_bits and is_tensor and weight_quant.symmetric and is_static - - # def _is_dynamic_token_w8a8( - # self, weight_quant: BaseModel, input_quant: BaseModel - # ) -> bool: - # is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - # weight_strategy = ( - # weight_quant.strategy == QuantizationStrategy.TENSOR.value - # or weight_quant.strategy == QuantizationStrategy.CHANNEL.value - # ) - # is_token = ( - # weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value - # ) - # is_dynamic = not weight_quant.dynamic and input_quant.dynamic - - # # Both symmetric and asymmetric input quantization supported. - # # Only symmetric weight quantization supported. - # return is_8_bits and is_token and weight_quant.symmetric and is_dynamic - class ModelSlimLinearMethod(_NPULinearMethodBase): From 1f18881992df3496987194507cb3f0af3fefd5b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:22:05 +0300 Subject: [PATCH 040/175] Delete comments --- .../quantization/msmodelslim/msmodelslim.py | 31 ++----------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 111fda21c02e..78e5b2d66ce8 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -6,12 +6,7 @@ import torch -# from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( -# NPUW4A8Int4DynamicMoEMethod, -# NPUW4A16Int4DynamicMoEMethod, -# NPUW8A8Int8DynamicMoEMethod, -# ) -from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod, +from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import ( _NPULinearMethodBase, ) from sglang.srt.layers.quantization.base_config import ( @@ -28,7 +23,6 @@ ModelSlimW8A8Int8, ) -# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod from sglang.srt.utils import apply_module_patch @@ -90,20 +84,6 @@ def __init__(self, quant_config: Dict[str, Any] = {}): self.packed_modules_mapping = ( packed_modules_mapping if packed_modules_mapping is not None else {} ) - # self.target_scheme_map = ( - # CompressedTensorsConfig._quantization_scheme_map_from_config( - # config=quant_config - # ) - # ) - # target = "MoEGMM" if "MoEGMM" in self.target_scheme_map else "Linear" - # target_scheme = self.target_scheme_map.get(target, None) - # if target_scheme is None: - # self.is_moe_w4_dynamic = False - # else: - # weight_quant = target_scheme.get("weights") - # input_quant = target_scheme.get("input_activations") - # self.is_moe_w4_dynamic = self.is_dynamic_token_w4(weight_quant, input_quant) - # self.is_moe_input_quant = input_quant for name in self.quant_description.keys(): if "norm.bias" in name: @@ -169,10 +149,6 @@ def get_quant_method( prefix_in_quant_config = prefix.replace( proj_name, packed_modules_mapping_subset[proj_name][0] ) - # self.is_dynamic = ( - # self.quant_description[prefix_in_quant_config + ".weight"] - # == "W8A8_DYNAMIC" - # ) if self.is_layer_skipped(prefix, packed_modules_mapping_subset): return UnquantizedLinearMethod() @@ -210,14 +186,11 @@ def get_scheme( get_scheme method adjusted for modelslim, taken from python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py """ - scheme = self._get_scheme_from_parts( # type: ignore - # weight_quant=weight_quant, - # input_quant=input_quant, + scheme = self._get_scheme_from_parts( layer_name=layer_name, ) # Ascend doesn't support device capability - # self._check_scheme_supported(scheme.get_min_capability()) logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) return scheme From 2bee5c7f3ddb396bf5b92fd908843babb7e882f2 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:37:07 +0300 Subject: [PATCH 041/175] Update model_config.py --- python/sglang/srt/configs/model_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 5e5eba62a295..ef7bc4bdcc3a 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -727,8 +727,8 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF and ModelSlim model config, if available. # Only one function should return config, other should return None. cfg_list = [] - cfg_list.append(self._parse_quant_hf_config) - cfg_list.append(self._find_quant_modelslim_config) + cfg_list.append(self._parse_quant_hf_config()) + cfg_list.append(self._find_quant_modelslim_config()) # Filter out None values cfg_list = [item for item in cfg_list if item is not None] From 2670aa96c74fc3e144ff577790350901a30d92d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:01:48 +0300 Subject: [PATCH 042/175] Quickfix --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 71ab140c1f02..9b4ad95dbd12 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -344,8 +344,7 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w13_weight.data = cls.pack_to_int32(layer.w13_weight.data) layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data) - staticmethod - + @staticmethod def apply( layer, dispatch_output: "StandardDispatchOutput", From 1e45ead569332edd54bb6611786d0bf6695061d6 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:38:18 +0300 Subject: [PATCH 043/175] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 9b4ad95dbd12..ce17188d71c8 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -431,7 +431,6 @@ def apply( @staticmethod def apply_without_routing_weights( - cls, layer, hidden_states, hidden_states_scale, From afc11a67f5d0bd9159b4a21d8ba2441313f88d06 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Wed, 17 Dec 2025 19:59:29 +0300 Subject: [PATCH 044/175] Update CODEOWNERS --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 898a44404431..d86c5a9519d7 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,6 +15,7 @@ /python/sglang/srt/function_call @CatherineSue @JustinTong0323 /python/sglang/srt/grpc @CatherineSue @slin1237 /python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname +/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname /python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1 /python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064 /python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @yizhang2077 @hebiao064 @hanming-lu From 168b2a84aa86bc3d8a6125ffad9dd420fe718c80 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Wed, 17 Dec 2025 20:31:51 +0300 Subject: [PATCH 045/175] Pre-commit fixes --- .github/CODEOWNERS | 2 +- .../sglang/srt/layers/quantization/msmodelslim/msmodelslim.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d86c5a9519d7..e88cfe589ab6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,7 +15,7 @@ /python/sglang/srt/function_call @CatherineSue @JustinTong0323 /python/sglang/srt/grpc @CatherineSue @slin1237 /python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname -/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname +/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname /python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1 /python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064 /python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @yizhang2077 @hebiao064 @hanming-lu diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 78e5b2d66ce8..1ba64f7c4601 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -22,7 +22,6 @@ ModelSlimW4A4Int4, ModelSlimW8A8Int8, ) - from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod from sglang.srt.utils import apply_module_patch @@ -178,7 +177,6 @@ def _get_scheme_from_parts( quant_config=self.quant_description, prefix=layer_name ) - def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None ) -> Optional[ModelSlimScheme]: From d5516526e4977e02c3e9261af6750cc7cabfa26d Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 12:31:03 +0300 Subject: [PATCH 046/175] Update msmodelslim_w8a8_int8.py --- .../quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index de99c9fed0b7..c462b2a66bea 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -25,7 +25,7 @@ def __init__( prefix: str, ): self.quant_config = quant_config - self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC" + self.is_dynamic = self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC" def create_weights( self, From 1cf18c0f7b3ab597bc24621aed607c7945b961dd Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 12:32:25 +0300 Subject: [PATCH 047/175] Update msmodelslim.py --- .../srt/layers/quantization/msmodelslim/msmodelslim.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 1ba64f7c4601..5eb341415d1e 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -167,7 +167,7 @@ def _get_scheme_from_parts( layer_name: str, ) -> ModelSlimScheme: - quant_type = self.quant_description[layer_name + ".weight"] + quant_type = self.quant_description.get(layer_name + ".weight", "") if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8": return ModelSlimW8A8Int8( quant_config=self.quant_description, prefix=layer_name @@ -206,7 +206,7 @@ def is_layer_skipped( is_skipped = None for shard_prefix in shard_prefixes: is_shard_skipped = ( - self.quant_description[shard_prefix + ".weight"] == "FLOAT" + self.quant_description.get(shard_prefix + ".weight", "") == "FLOAT" ) if is_skipped is None: @@ -218,7 +218,7 @@ def is_layer_skipped( "to have the same precision." ) else: - is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT" + is_skipped = self.quant_description.get(prefix + ".weight", "") == "FLOAT" assert is_skipped is not None return is_skipped From 3dccf89b5a5138826bc6ffb60bd58dab4889b22a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 14:08:02 +0300 Subject: [PATCH 048/175] Delete python/sglang/srt/hardware_backend/npu/quantization/modelslim.py --- python/sglang/srt/hardware_backend/npu/quantization/modelslim.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/modelslim.py diff --git a/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py b/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py deleted file mode 100644 index e69de29bb2d1..000000000000 From 1842d0a521e4c15e38a7b237496c01e0b978d2a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 14:15:02 +0300 Subject: [PATCH 049/175] Removed unused code --- .../sglang/srt/layers/quantization/msmodelslim/msmodelslim.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index 5eb341415d1e..dc43e6b79b5d 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -75,8 +75,6 @@ class ModelSlimConfig(QuantizationConfig): def __init__(self, quant_config: Dict[str, Any] = {}): super().__init__() self.quant_description = quant_config - # self.is_dynamic = quant_config.get("is_dynamic", False) - # self.is_moe_w4_dynamic = False ignore = cast(List[str], quant_config.get("ignore", [])) self.ignore = ignore if ignore is not None else [] packed_modules_mapping = quant_config.get("packed_modules_mapping", {}) From 75de787a1087c42b9a7001fb80064e8d8cf78d94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:28:54 +0300 Subject: [PATCH 050/175] Remove --quantization modelslim flag from doc --- docs/platforms/ascend_npu_deepseek_example.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/platforms/ascend_npu_deepseek_example.md b/docs/platforms/ascend_npu_deepseek_example.md index acb864ef568e..08bc98613c23 100644 --- a/docs/platforms/ascend_npu_deepseek_example.md +++ b/docs/platforms/ascend_npu_deepseek_example.md @@ -30,7 +30,6 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --attention-backend ascend \ --device npu \ - --quantization modelslim \ --watchdog-timeout 9000 \ --host 127.0.0.1 \ --port 6688 \ @@ -89,7 +88,6 @@ python -m sglang.launch_server \ --mem-fraction-static 0.6 \ --attention-backend ascend \ --device npu \ - --quantization modelslim \ --disaggregation-transfer-backend ascend \ --max-running-requests 8 \ --context-length 8192 \ @@ -145,7 +143,6 @@ python -m sglang.launch_server \ --max-running-requests 352 \ --attention-backend ascend \ --device npu \ - --quantization modelslim \ --moe-a2a-backend deepep \ --enable-dp-attention \ --deepep-mode low_latency \ @@ -214,7 +211,6 @@ do --mem-fraction-static 0.81 \ --attention-backend ascend \ --device npu \ - --quantization modelslim \ --disaggregation-transfer-backend ascend \ --max-running-requests 8 \ --context-length 8192 \ @@ -275,7 +271,6 @@ do --max-running-requests 832 \ --attention-backend ascend \ --device npu \ - --quantization modelslim \ --moe-a2a-backend deepep \ --enable-dp-attention \ --deepep-mode low_latency \ From e9587675cd85f2989a5d6eca20702c693a2d7273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:30:42 +0300 Subject: [PATCH 051/175] Delete --quantization "modelslim" flag --- test/srt/ascend/test_ascend_deepep.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_deepep.py b/test/srt/ascend/test_ascend_deepep.py index c5f2cb6faa4a..8822fc2f8278 100644 --- a/test/srt/ascend/test_ascend_deepep.py +++ b/test/srt/ascend/test_ascend_deepep.py @@ -34,8 +34,6 @@ def setUpClass(cls): "--trust-remote-code", "--attention-backend", "ascend", - "--quantization", - "modelslim", "--mem-fraction-static", 0.8, "--disable-radix-cache", From 15678852e1ec86f1892169412bc4983678dd4692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:31:05 +0300 Subject: [PATCH 052/175] Delete --quantization "modelslim" flag --- test/srt/ascend/test_ascend_deepseek_mtp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_deepseek_mtp.py b/test/srt/ascend/test_ascend_deepseek_mtp.py index 43089f885e97..d08329481d4f 100644 --- a/test/srt/ascend/test_ascend_deepseek_mtp.py +++ b/test/srt/ascend/test_ascend_deepseek_mtp.py @@ -32,8 +32,6 @@ def setUpClass(cls): "--trust-remote-code", "--attention-backend", "ascend", - "--quantization", - "modelslim", "--mem-fraction-static", 0.8, "--disable-radix-cache", From d34cb6fba083f62ac9891a0b7b0b7941b67cee8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:31:23 +0300 Subject: [PATCH 053/175] Update test_ascend_hicache_mla.py --- test/srt/ascend/test_ascend_hicache_mla.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_hicache_mla.py b/test/srt/ascend/test_ascend_hicache_mla.py index 5e7c711e868d..d0bc1f378cfa 100644 --- a/test/srt/ascend/test_ascend_hicache_mla.py +++ b/test/srt/ascend/test_ascend_hicache_mla.py @@ -35,8 +35,6 @@ def setUpClass(cls): 0.8, "--attention-backend", "ascend", - "--quantization", - "modelslim", "--tp-size", 4, "--enable-hierarchical-cache", From 09a6d445795229410324feb652f8b965554fc261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:32:08 +0300 Subject: [PATCH 054/175] Delete --quantization "modelslim" flag --- test/srt/ascend/test_ascend_mla_fia_w8a8int8.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py b/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py index 1a0eb7f6dd05..bdab4ea05781 100644 --- a/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py +++ b/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py @@ -37,8 +37,6 @@ def setUpClass(cls): 0.8, "--attention-backend", "ascend", - "--quantization", - "modelslim", "--tp-size", 2, "--disable-radix-cache", From 2b7003e7f77b1507a48d6bc9fa5499840b70a32d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:32:32 +0300 Subject: [PATCH 055/175] Update test_ascend_mla_w8a8int8.py --- test/srt/ascend/test_ascend_mla_w8a8int8.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_mla_w8a8int8.py b/test/srt/ascend/test_ascend_mla_w8a8int8.py index eddae3086c6d..3c3e733669ea 100644 --- a/test/srt/ascend/test_ascend_mla_w8a8int8.py +++ b/test/srt/ascend/test_ascend_mla_w8a8int8.py @@ -36,8 +36,6 @@ def setUpClass(cls): 0.8, "--attention-backend", "ascend", - "--quantization", - "modelslim", "--tp-size", 4, "--disable-radix-cache", From 43b5d66d15fae0678e5021b8fbdfd2718aca9d98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:07:37 +0300 Subject: [PATCH 056/175] Create README.md for msModelSlim --- .../layers/quantization/msmodelslim/README.md | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/README.md diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md new file mode 100644 index 000000000000..9eaa9ab248b0 --- /dev/null +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -0,0 +1,57 @@ +Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module. + +MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as: +- [x] W4A4 dynamic linear +- [x] W8A8 static linear +- [x] W8A8 dynamic linear +- [x] W4A8 dynamic MOE +- [x] W8A8 dynamic MOE + +Also MsModelSlim module include: +- [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag) +- [x] Unit-tests for w4a4 modelslim, w8a8 modelslim + +Examples of launch: +server: +`SGLANG_SET_CPU_AFFINITY=1 +PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +STREAMS_PER_DEVICE=32 +HCCL_BUFFSIZE=1536 +ENABLE_ASCEND_MOE_NZ=1 +ASCEND_RT_VISIBLE_DEVICES=0,1 +python3 -m sglang.launch_server --device npu --attention-backend ascend --trust-remote-code --tp-size 2 --model-path *model* --port 30088 --mem-fraction-static 0.8 --cuda-graph-max-bs 16` + +client: +`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16` + + +Qwen3-32B-W4A4 from msmodelslim (dynamic) - Ascend 910B2 +image + +Qwen3-32B-W8A8 from msmodelslim (static) - Ascend 910B4 +image + +Qwen3-32B-W8A8 from msmodelslim (dynamic) - Ascend 910B2 +image + +Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) - Ascend 910B2 +image + +server: +`sysctl -w vm.swappiness=0 +sysctl -w kernel.numa_balancing=0 +sysctl -w kernel.sched_migration_cost_ns=50000 +export SGLANG_SET_CPU_AFFINITY=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export STREAMS_PER_DEVICE=32 +export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32 +export HCCL_BUFFSIZE=1536 +export ENABLE_ASCEND_MOE_NZ=1 +export HCCL_OP_EXPANSION_MODE=AIV +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` +`python3 -m sglang.launch_server --model-path *model* --tp 4 --trust-remote-code --attention-backend ascend --device npu --host 127.0.0.1 --port 30088 --mem-fraction-static 0.8 --quantization modelslim --moe-a2a-backend deepep --deepep-mode auto` +client: +`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16` + +Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) with EP - Ascend 910C +image From 420d6e8ab27cc41ac30db92b481ebe1725b0f9a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:08:11 +0300 Subject: [PATCH 057/175] Update README.md --- python/sglang/srt/layers/quantization/msmodelslim/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md index 9eaa9ab248b0..3519a00c64ab 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -12,6 +12,7 @@ Also MsModelSlim module include: - [x] Unit-tests for w4a4 modelslim, w8a8 modelslim Examples of launch: + server: `SGLANG_SET_CPU_AFFINITY=1 PYTORCH_NPU_ALLOC_CONF=expandable_segments:True From f79f9eed1508d8c9fbcd5e12bd493fcd019afb37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:08:28 +0300 Subject: [PATCH 058/175] Update README.md --- python/sglang/srt/layers/quantization/msmodelslim/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md index 3519a00c64ab..3eab5da35f55 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -51,6 +51,7 @@ export ENABLE_ASCEND_MOE_NZ=1 export HCCL_OP_EXPANSION_MODE=AIV export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` `python3 -m sglang.launch_server --model-path *model* --tp 4 --trust-remote-code --attention-backend ascend --device npu --host 127.0.0.1 --port 30088 --mem-fraction-static 0.8 --quantization modelslim --moe-a2a-backend deepep --deepep-mode auto` + client: `python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16` From a7c43bb1b7244946b39be6fef1ad1e28ffaafd9d Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:16:27 +0300 Subject: [PATCH 059/175] Update fused_moe_method_npu.py 1/4 W4A16 refactoring --- .../npu/quantization/fused_moe_method_npu.py | 135 ++---------------- 1 file changed, 12 insertions(+), 123 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index ce17188d71c8..9686ce2b4f5a 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -475,116 +475,8 @@ def apply_without_routing_weights( class NPUW4A16Int4DynamicMoEMethod(FusedMoEMethodBase): - def __init__(self, quantization_config) -> None: - self.pack_factor = 8 # weight dtype is int4, but use int32 to create - target = ( - "MoEGMM" if "MoEGMM" in quantization_config.target_scheme_map else "Linear" - ) - if target in quantization_config.target_scheme_map: - self.group_size = quantization_config.target_scheme_map[target][ - "weights" - ].group_size - else: - self.group_size = 128 - - def create_weights( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported - - self.num_experts = num_experts - if ( - extra_weight_attrs.get( - "intermediate_size_full", intermediate_size_per_partition - ) - // intermediate_size_per_partition - > 1 - ): - quant_method = FusedMoeWeightScaleSupported.GROUP.value - else: - quant_method = FusedMoeWeightScaleSupported.CHANNEL.value - extra_weight_attrs.update({"quant_method": quant_method}) - # weight - w13_weight = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size // self.pack_factor, - dtype=torch.int32, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition // self.pack_factor, - dtype=torch.int32, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - - # scale - weight_scale_dtype = torch.bfloat16 - w13_weight_scale = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size // self.group_size, - dtype=weight_scale_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - w2_weight_scale = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition // self.group_size, - dtype=weight_scale_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) - - # offset - w13_weight_offset = torch.nn.Parameter( - torch.zeros( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size // self.group_size, - dtype=weight_scale_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_offset", w13_weight_offset) - set_weight_attrs(w13_weight_offset, extra_weight_attrs) - - w2_weight_offset = torch.nn.Parameter( - torch.zeros( - num_experts, - hidden_size, - intermediate_size_per_partition // self.group_size, - dtype=weight_scale_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight_offset", w2_weight_offset) - set_weight_attrs(w2_weight_offset, extra_weight_attrs) - - def pack_to_int32(self, weight: torch.Tensor): + @classmethod + def pack_to_int32(cls, weight: torch.Tensor): assert weight.dim() == 3 if weight.dtype == torch.int32: # pack 8 int4 to int32, we use a int32 to represent a int4 @@ -605,8 +497,9 @@ def pack_to_int32(self, weight: torch.Tensor): raise ValueError(f"{weight.dtype=} is not supported !") return new_weight + @classmethod def unpack_from_int32( - self, + cls, value: torch.Tensor, num_bits: int, shape: torch.Size = None, @@ -669,7 +562,8 @@ def unpack_from_int32( return unpacked - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + @classmethod + def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous() w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous() layer.w13_weight_scale = torch.nn.Parameter( @@ -690,33 +584,28 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous() # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous() unpacked_w13_weight = ( - self.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4) + cls.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4) .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1) .transpose(1, 2) .contiguous() .int() ) unpacked_w2_weight = ( - self.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4) + cls.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4) .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1) .transpose(1, 2) .contiguous() .int() ) - w13_weight = self.pack_to_int32(unpacked_w13_weight) - w2_weight = self.pack_to_int32(unpacked_w2_weight) + w13_weight = cls.pack_to_int32(unpacked_w13_weight) + w2_weight = cls.pack_to_int32(unpacked_w2_weight) layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - def create_moe_runner( - self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" - ): - self.moe_runner_config = moe_runner_config - + @staticmethod def apply( - self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": @@ -743,8 +632,8 @@ def apply( ) return StandardCombineInput(hidden_states=output) + @staticmethod def apply_without_routing_weights( - self, layer, hidden_states, hidden_states_scale, From ef2fdb839f8e3a584dcb89933ec7aaa635d1b792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:17:05 +0300 Subject: [PATCH 060/175] Update README.md --- python/sglang/srt/layers/quantization/msmodelslim/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md index 3eab5da35f55..27f0680f0a6b 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -1,5 +1,7 @@ Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module. +`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be parsed from the downloaded `quant_model_description.json` config. + MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as: - [x] W4A4 dynamic linear - [x] W8A8 static linear From cb95c0a3d3344bcee2a9a9e0700b1a799660eaa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:17:41 +0300 Subject: [PATCH 061/175] Update README.md --- python/sglang/srt/layers/quantization/msmodelslim/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md index 27f0680f0a6b..8db5f10d61cf 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -1,6 +1,6 @@ Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module. -`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be parsed from the downloaded `quant_model_description.json` config. +`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config. MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as: - [x] W4A4 dynamic linear From ca38c591205131689c743a60b775724f65347008 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:17:55 +0300 Subject: [PATCH 062/175] Update layer.py 2/4 W4A16 refactoring --- python/sglang/srt/layers/moe/ep_moe/layer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index 4f22ba798c84..7fbdcff31e85 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -6,8 +6,8 @@ import torch from sglang.srt.environ import envs -from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( - NPUW4A16Int4DynamicMoEMethod, +from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import( + NPUCompressedTensorsW4A16Int4DynamicMoEMethod, ) from sglang.srt.layers import deep_gemm_wrapper from sglang.srt.layers.moe import ( @@ -351,7 +351,7 @@ def forward_npu( else: input_quant = get_bool_env_var("DEEP_NORMAL_MODE_USE_INT8_QUANT") if not input_quant and not isinstance( - self.quant_method, NPUW4A16Int4DynamicMoEMethod + self.quant_method, NPUCompressedTensorsW4A16Int4DynamicMoEMethod ): hidden_states, hidden_states_scale = torch_npu.npu_dynamic_quant( hidden_states From 583cb4d3188c2aabc975da98208fe15a1b5aa0b1 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:18:46 +0300 Subject: [PATCH 063/175] Update compressed_tensors.py 3/4 W4A16 refactoring --- .../compressed_tensors/compressed_tensors.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 171573f1b914..56b5f4beb52b 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -449,6 +449,29 @@ def _is_wNa16_group_channel( return is_channel_group and input_quant_none and is_symmetric and is_static + def _is_dynamic_token_w4( + self, weight_quant: BaseModel, input_quant: BaseModel + ) -> bool: + is_w4 = weight_quant.num_bits == 4 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + or weight_quant.strategy == QuantizationStrategy.GROUP.value + ) + if input_quant is not None: + is_token = ( + weight_strategy + and input_quant.strategy == QuantizationStrategy.TOKEN.value + ) + is_dynamic = not weight_quant.dynamic and input_quant.dynamic + else: + is_token = weight_strategy + is_dynamic = not weight_quant.dynamic + + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_w4 and weight_quant.symmetric and is_token and is_dynamic + def _get_scheme_from_parts( self, weight_quant: BaseModel, input_quant: BaseModel ) -> CompressedTensorsScheme: From 8af003309d3ee19ef6e103598d14ee11918d7409 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:20:00 +0300 Subject: [PATCH 064/175] Update compressed_tensors_moe.py 4/4 W4A16 refactoring --- .../compressed_tensors_moe.py | 167 +++++++++++++++++- 1 file changed, 160 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 32ed5554736c..daedb41c3c37 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -13,6 +13,7 @@ from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( NPUW8A8Int8DynamicMoEMethod, + NPUW4A16Int4DynamicMoEMethod, ) from sglang.srt.distributed import get_tensor_model_parallel_world_size, get_tp_group from sglang.srt.distributed.device_communicators.pynccl_allocator import ( @@ -85,7 +86,9 @@ class GPTQMarlinState(Enum): "CompressedTensorsMoEMethod", "CompressedTensorsW4A4Nvfp4MoEMethod", "CompressedTensorsW8A8Fp8MoEMethod", - "NPUCompressedTensorsW8A8Int8MoEMethod" "CompressedTensorsWNA16MoEMethod", + "NPUCompressedTensorsW8A8Int8MoEMethod", + "CompressedTensorsWNA16MoEMethod", + "NPUCompressedTensorsW4A16Int4DynamicMoEMethod" ] @@ -108,8 +111,13 @@ def get_moe_method( input_quant = quant_config.target_scheme_map["Linear"].get("input_activations") if quant_config._is_wNa16_group_channel(weight_quant, input_quant): - logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") - return CompressedTensorsWNA16MoEMethod(quant_config) + if _is_cuda or _is_hip: + logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") + return CompressedTensorsWNA16MoEMethod(quant_config) + elif _is_npu: + if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None: + logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod") + return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod") return CompressedTensorsW4A4Nvfp4MoEMethod(quant_config) @@ -118,8 +126,8 @@ def get_moe_method( return CompressedTensorsW8A8Fp8MoEMethod(quant_config) elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant): if _is_npu: - logger.info_once("Using NPUCompressedTensorsW8A8Int8MoEMethod") - return NPUCompressedTensorsW8A8Int8MoEMethod(quant_config) + logger.info_once("Using NPUCompressedTensorsW8A8Int8DynamicMoEMethod") + return NPUCompressedTensorsW8A8Int8DynamicMoEMethod(quant_config) else: raise NotImplementedError( f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now." @@ -866,7 +874,7 @@ def apply( return self.runner.run(dispatch_output, quant_info) -class NPUCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): +class NPUCompressedTensorsW8A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): def __init__(self, quant_config: CompressedTensorsConfig): self.quant_config = quant_config @@ -969,7 +977,6 @@ def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig ): self.moe_runner_config = moe_runner_config - self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config) def apply( self, @@ -1286,3 +1293,149 @@ def apply( routed_scaling_factor=self.moe_runner_config.routed_scaling_factor, ) return StandardCombineInput(hidden_states=output) + + +class NPUCompressedTensorsW4A16Int4DynamicMoEMethod(CompressedTensorsMoEMethod): + + def __init__(self, quantization_config) -> None: + self.pack_factor = 8 # weight dtype is int4, but use int32 to create + target = ( + "MoEGMM" if "MoEGMM" in quantization_config.target_scheme_map else "Linear" + ) + if target in quantization_config.target_scheme_map: + self.group_size = quantization_config.target_scheme_map[target][ + "weights" + ].group_size + else: + self.group_size = 128 + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported + + self.num_experts = num_experts + if ( + extra_weight_attrs.get( + "intermediate_size_full", intermediate_size_per_partition + ) + // intermediate_size_per_partition + > 1 + ): + quant_method = FusedMoeWeightScaleSupported.GROUP.value + else: + quant_method = FusedMoeWeightScaleSupported.CHANNEL.value + extra_weight_attrs.update({"quant_method": quant_method}) + # weight + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # scale + weight_scale_dtype = torch.bfloat16 + w13_weight_scale = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=weight_scale_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + w2_weight_scale = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=weight_scale_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # offset + w13_weight_offset = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=weight_scale_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_offset", w13_weight_offset) + set_weight_attrs(w13_weight_offset, extra_weight_attrs) + + w2_weight_offset = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=weight_scale_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_offset", w2_weight_offset) + set_weight_attrs(w2_weight_offset, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + NPUW4A16Int4DynamicMoEMethod.process_weights_after_loading(layer) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig + ): + self.moe_runner_config = moe_runner_config + + def apply( + self, + layer: torch.nn.Module, + dispatch_output: StandardDispatchOutput, + ) -> CombineInput: + + return NPUW4A16Int4DynamicMoEMethod.apply(layer, dispatch_output) + + def apply_without_routing_weights( + self, + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ): + return NPUW4A16Int4DynamicMoEMethod.apply_without_routing_weights( + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ) From 9f8c40708ecac06a833f6f0d11e1a161e24769c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:24:23 +0300 Subject: [PATCH 065/175] Quickfix --- .../compressed_tensors/compressed_tensors.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 56b5f4beb52b..d69a222fb836 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -533,33 +533,35 @@ def _get_scheme_from_parts( ) if self._is_static_tensor_w8a8(weight_quant, input_quant): - if _is_npu: - return NPUCompressedTensorsW8A8Int8( - strategy=weight_quant.strategy, - is_static_input_scheme=True, - input_symmetric=input_quant.symmetric, - ) - else: + if _is_cuda: return GPUCompressedTensorsW8A8Int8( strategy=weight_quant.strategy, is_static_input_scheme=True, input_symmetric=input_quant.symmetric, - ) + ) + elif _is_npu: + return NPUCompressedTensorsW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=True, + input_symmetric=input_quant.symmetric, + ) if self._is_dynamic_token_w8a8(weight_quant, input_quant): - if _is_npu: - return NPUCompressedTensorsW8A8Int8( + if _is_cuda: + return GPUCompressedTensorsW8A8Int8( strategy=weight_quant.strategy, is_static_input_scheme=False, input_symmetric=input_quant.symmetric, ) - else: - return GPUCompressedTensorsW8A8Int8( + elif _is_npu: + return NPUCompressedTensorsW8A8Int8( strategy=weight_quant.strategy, is_static_input_scheme=False, input_symmetric=input_quant.symmetric, ) + + raise NotImplementedError("No compressed-tensors compatible scheme was found.") def get_scheme( From 72efd3ac02f0f5c753966a0dd62d6b9481e02cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:10:23 +0300 Subject: [PATCH 066/175] Update README.md --- .../layers/quantization/msmodelslim/README.md | 47 ------------------- 1 file changed, 47 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md index 8db5f10d61cf..c4a5a8b1f14f 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -12,50 +12,3 @@ MsModelSlim was developed in the format of compressed_tensors and includes suppo Also MsModelSlim module include: - [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag) - [x] Unit-tests for w4a4 modelslim, w8a8 modelslim - -Examples of launch: - -server: -`SGLANG_SET_CPU_AFFINITY=1 -PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -STREAMS_PER_DEVICE=32 -HCCL_BUFFSIZE=1536 -ENABLE_ASCEND_MOE_NZ=1 -ASCEND_RT_VISIBLE_DEVICES=0,1 -python3 -m sglang.launch_server --device npu --attention-backend ascend --trust-remote-code --tp-size 2 --model-path *model* --port 30088 --mem-fraction-static 0.8 --cuda-graph-max-bs 16` - -client: -`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16` - - -Qwen3-32B-W4A4 from msmodelslim (dynamic) - Ascend 910B2 -image - -Qwen3-32B-W8A8 from msmodelslim (static) - Ascend 910B4 -image - -Qwen3-32B-W8A8 from msmodelslim (dynamic) - Ascend 910B2 -image - -Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) - Ascend 910B2 -image - -server: -`sysctl -w vm.swappiness=0 -sysctl -w kernel.numa_balancing=0 -sysctl -w kernel.sched_migration_cost_ns=50000 -export SGLANG_SET_CPU_AFFINITY=1 -export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -export STREAMS_PER_DEVICE=32 -export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32 -export HCCL_BUFFSIZE=1536 -export ENABLE_ASCEND_MOE_NZ=1 -export HCCL_OP_EXPANSION_MODE=AIV -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` -`python3 -m sglang.launch_server --model-path *model* --tp 4 --trust-remote-code --attention-backend ascend --device npu --host 127.0.0.1 --port 30088 --mem-fraction-static 0.8 --quantization modelslim --moe-a2a-backend deepep --deepep-mode auto` - -client: -`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16` - -Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) with EP - Ascend 910C -image From 384835b500bbbd3e7557ba9a0ccbbadd3aa6d0cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:42:33 +0300 Subject: [PATCH 067/175] Update msmodelslim_moe.py --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 4b7b596c4f8d..e580fc3df306 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -69,7 +69,6 @@ def __init__( self.quant_config = quant_config self.group_size = 0 self.tp_size = 1 - self.is_per_channel_weight = self.group_size == 0 def create_weights( self, @@ -82,6 +81,7 @@ def create_weights( ) -> None: from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported + self.is_per_channel_weight = self.group_size == 0 self.num_experts = num_experts extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} @@ -141,9 +141,9 @@ def create_weights( ) layer.register_parameter("w2_weight_offset", w2_weight_offset) set_weight_attrs(w2_weight_offset, extra_weight_attrs) - + + # >>> special param for w4a8 if not self.is_per_channel_weight: - # >>> special param for w4a8 w13_weight_scale_second = torch.nn.Parameter( torch.empty( num_experts, From 4ebfb54f1eba1b01ec1743cf6e333bc23452fd78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:43:44 +0300 Subject: [PATCH 068/175] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 9686ce2b4f5a..8598ff650aa8 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -258,8 +258,7 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): @classmethod def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() - # if cls.is_per_channel_weight: - if True: + if cls.is_per_channel_weight: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu() From 0cfbd93666437fbfb89ded6b2b914322d1b22af6 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:06:03 +0300 Subject: [PATCH 069/175] Create test_ascend_w4a4_quantization.py in srt/ascend 1/4 new CI tests --- .../ascend/test_ascend_w4a4_quantization.py | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 test/srt/ascend/test_ascend_w4a4_quantization.py diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py new file mode 100644 index 000000000000..c2251ec94a9d --- /dev/null +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -0,0 +1,108 @@ +""" +Usage: +python3 -m unittest test_ascend_w4a4_quantization.TestAscendW4A4.test_gsm8k +""" + +import os +import time +import unittest +from types import SimpleNamespace +from urllib.parse import urlparse + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_ci, + popen_launch_server, +) + +if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" +DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( + 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 +) +DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" + + +class TestAscendW4A4(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--device", + "npu", + "--attention-backend", + "ascend", + "--tp-size", + "2", + "--mem-fraction-static", + "0.8", + "--cuda-graph-bs", + "64", + "--disable-radix-cache", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + base_url = DEFAULT_URL_FOR_TEST + url = urlparse(base_url) + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=128, + max_new_tokens=512, + parallel=64, + host=f"http://{url.hostname}", + port=int(url.port), + ) + metrics = run_eval(args) + print(metrics) + + self.assertGreaterEqual(metrics["accuracy"], 0.75) + self.assertGreaterEqual(metrics["output_throughput"], 700) + + def run_decode(self, max_new_tokens): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": max_new_tokens, + }, + "ignore_eos": True, + }, + ) + return response.json() + + def test_throughput(self): + max_tokens = 256 + + tic = time.perf_counter() + res = self.run_decode(max_tokens) + tok = time.perf_counter() + print(res["text"]) + throughput = max_tokens / (tok - tic) + print(f"Throughput: {throughput} tokens/s") + + if is_in_ci(): + self.assertGreaterEqual(throughput, 25) + + +if __name__ == "__main__": + unittest.main() From 87b65a820c49409f351d9c1c21f147f240cca5a5 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:06:31 +0300 Subject: [PATCH 070/175] Delete test/manual/ascend/test_ascend_w4a4_quantization.py 2/4 new CI tests --- .../ascend/test_ascend_w4a4_quantization.py | 108 ------------------ 1 file changed, 108 deletions(-) delete mode 100644 test/manual/ascend/test_ascend_w4a4_quantization.py diff --git a/test/manual/ascend/test_ascend_w4a4_quantization.py b/test/manual/ascend/test_ascend_w4a4_quantization.py deleted file mode 100644 index c2251ec94a9d..000000000000 --- a/test/manual/ascend/test_ascend_w4a4_quantization.py +++ /dev/null @@ -1,108 +0,0 @@ -""" -Usage: -python3 -m unittest test_ascend_w4a4_quantization.TestAscendW4A4.test_gsm8k -""" - -import os -import time -import unittest -from types import SimpleNamespace -from urllib.parse import urlparse - -import requests - -from sglang.srt.utils import kill_process_tree -from sglang.test.few_shot_gsm8k import run_eval -from sglang.test.test_utils import ( - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - is_in_ci, - popen_launch_server, -) - -if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" -DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( - 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 -) -DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" - - -class TestAscendW4A4(CustomTestCase): - @classmethod - def setUpClass(cls): - cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--device", - "npu", - "--attention-backend", - "ascend", - "--tp-size", - "2", - "--mem-fraction-static", - "0.8", - "--cuda-graph-bs", - "64", - "--disable-radix-cache", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_gsm8k(self): - base_url = DEFAULT_URL_FOR_TEST - url = urlparse(base_url) - args = SimpleNamespace( - num_shots=5, - data_path=None, - num_questions=128, - max_new_tokens=512, - parallel=64, - host=f"http://{url.hostname}", - port=int(url.port), - ) - metrics = run_eval(args) - print(metrics) - - self.assertGreaterEqual(metrics["accuracy"], 0.75) - self.assertGreaterEqual(metrics["output_throughput"], 700) - - def run_decode(self, max_new_tokens): - response = requests.post( - self.base_url + "/generate", - json={ - "text": "The capital of France is", - "sampling_params": { - "temperature": 0, - "max_new_tokens": max_new_tokens, - }, - "ignore_eos": True, - }, - ) - return response.json() - - def test_throughput(self): - max_tokens = 256 - - tic = time.perf_counter() - res = self.run_decode(max_tokens) - tok = time.perf_counter() - print(res["text"]) - throughput = max_tokens / (tok - tic) - print(f"Throughput: {throughput} tokens/s") - - if is_in_ci(): - self.assertGreaterEqual(throughput, 25) - - -if __name__ == "__main__": - unittest.main() From 177102dc660f38f78cf132bc42b38050919c02ad Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:07:53 +0300 Subject: [PATCH 071/175] Create test_ascend_w8a8_quantization.py 3/4 new CI tests --- .../ascend/test_ascend_w8a8_quantization.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 test/srt/ascend/test_ascend_w8a8_quantization.py diff --git a/test/srt/ascend/test_ascend_w8a8_quantization.py b/test/srt/ascend/test_ascend_w8a8_quantization.py new file mode 100644 index 000000000000..f3f9cdff952b --- /dev/null +++ b/test/srt/ascend/test_ascend_w8a8_quantization.py @@ -0,0 +1,103 @@ +""" +Usage: +python3 -m unittest test_ascend_w8a8_quantization.TestAscendW8A8.test_gsm8k +""" + +import os +import time +import unittest +from types import SimpleNamespace +from urllib.parse import urlparse + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_ci, + popen_launch_server, +) + +if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" +DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( + 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 +) +DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" + + +class TestAscendW8A8CompressedTensors(CustomTestCase): + @classmethod + def setUpClass(cls): + # TODO: Move model to CI or Modelscope + cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--disable-cuda-graph", + "--device", + "npu", + "--attention-backend", + "ascend", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + base_url = DEFAULT_URL_FOR_TEST + url = urlparse(base_url) + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host=f"http://{url.hostname}", + port=int(url.port), + ) + metrics = run_eval(args) + print(metrics) + + self.assertGreaterEqual(metrics["accuracy"], 0.3) + self.assertGreaterEqual(metrics["output_throughput"], 1000) + + def run_decode(self, max_new_tokens): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": max_new_tokens, + }, + "ignore_eos": True, + }, + ) + return response.json() + + def test_throughput(self): + max_tokens = 256 + + tic = time.perf_counter() + res = self.run_decode(max_tokens) + tok = time.perf_counter() + print(res["text"]) + throughput = max_tokens / (tok - tic) + print(f"Throughput: {throughput} tokens/s") + + if is_in_ci(): + self.assertGreaterEqual(throughput, 25) + + +if __name__ == "__main__": + unittest.main() From 16ca7733ae6825d0d5337a7692b089bd478f908e Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:08:41 +0300 Subject: [PATCH 072/175] Update run_suite.py 4/4 new CI tests --- test/srt/run_suite.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 541d59b10901..89391e42aca1 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -361,12 +361,14 @@ TestFile("ascend/test_ascend_sampling_backend.py", 400), TestFile("ascend/test_ascend_tp1_bf16.py", 400), TestFile("ascend/test_ascend_compile_graph_tp1_bf16.py", 400), + TestFile("ascend/test_ascend_w8a8_quantization.py", 400), ], "per-commit-2-npu-a2": [ TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400), TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), + TestFile("ascend/test_ascend_w4a4_quantization.py", 400), ], "per-commit-4-npu-a2": [ TestFile("ascend/test_ascend_mla_w8a8int8.py", 400), From c6def39082c0ad45cbeeae89cdacd4c4b37a9cda Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:11:10 +0300 Subject: [PATCH 073/175] Update test_ascend_w8a8_quantization.py Remove compressed-tensors test, remove quantization flag --- .../ascend/test_ascend_w8a8_quantization.py | 72 ------------------- 1 file changed, 72 deletions(-) diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py index 959bf88a513f..e013c150c314 100644 --- a/test/manual/ascend/test_ascend_w8a8_quantization.py +++ b/test/manual/ascend/test_ascend_w8a8_quantization.py @@ -45,8 +45,6 @@ def setUpClass(cls): "npu", "--attention-backend", "ascend", - "--quantization", - "w8a8_int8", ], ) @@ -100,75 +98,5 @@ def test_throughput(self): self.assertGreaterEqual(throughput, 25) -class TestAscendW8A8CompressedTensors(CustomTestCase): - @classmethod - def setUpClass(cls): - # TODO: Move model to CI or Modelscope - cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--disable-cuda-graph", - "--device", - "npu", - "--attention-backend", - "ascend", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_gsm8k(self): - base_url = DEFAULT_URL_FOR_TEST - url = urlparse(base_url) - args = SimpleNamespace( - num_shots=5, - data_path=None, - num_questions=200, - max_new_tokens=512, - parallel=128, - host=f"http://{url.hostname}", - port=int(url.port), - ) - metrics = run_eval(args) - print(metrics) - - self.assertGreaterEqual(metrics["accuracy"], 0.3) - self.assertGreaterEqual(metrics["output_throughput"], 1000) - - def run_decode(self, max_new_tokens): - response = requests.post( - self.base_url + "/generate", - json={ - "text": "The capital of France is", - "sampling_params": { - "temperature": 0, - "max_new_tokens": max_new_tokens, - }, - "ignore_eos": True, - }, - ) - return response.json() - - def test_throughput(self): - max_tokens = 256 - - tic = time.perf_counter() - res = self.run_decode(max_tokens) - tok = time.perf_counter() - print(res["text"]) - throughput = max_tokens / (tok - tic) - print(f"Throughput: {throughput} tokens/s") - - if is_in_ci(): - self.assertGreaterEqual(throughput, 25) - - if __name__ == "__main__": unittest.main() From d0dd42766f450070efae9a12c1c786e971261eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 20:12:47 +0300 Subject: [PATCH 074/175] Create ascend_npu_quantization.md --- docs/platforms/ascend_npu_quantization.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 docs/platforms/ascend_npu_quantization.md diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md new file mode 100644 index 000000000000..053e12777b0b --- /dev/null +++ b/docs/platforms/ascend_npu_quantization.md @@ -0,0 +1,19 @@ +Quantization on Ascend. + +To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` or `config.json` config. + +MsModelSlim on Ascend support: +- [x] W4A4 dynamic linear +- [x] W8A8 static linear +- [x] W8A8 dynamic linear +- [x] W4A8 dynamic MOE +- [x] W8A8 dynamic MOE + +AWQ on Ascend support: +- [x] W4A16 linear +- [x] W8A16 MOE + +Compressed-tensors (LLM Compressor) on Ascend support: +- [x] W8A8 dynamic linear +- [x] W8A8 dynamic MOE +- [x] W4A16 MOE From 2e1219fe9a18ec3efa84289d10ef3495db427aed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 20:16:21 +0300 Subject: [PATCH 075/175] Bugfix --- .../quantization/compressed_tensors/compressed_tensors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index d69a222fb836..107ec12a8b11 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -54,11 +54,11 @@ ) from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod -from sglang.srt.utils import is_npu +from sglang.srt.utils import is_cuda, is_npu +_is_cuda = is_cuda() _is_npu = is_npu() - if TYPE_CHECKING: from sglang.srt.models.utils import WeightsMapper From 9d6ffbd6e68aad1a83d038a0bc972a9ea7e486ef Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Thu, 18 Dec 2025 20:17:20 +0300 Subject: [PATCH 076/175] Pre-commit fixes --- docs/platforms/ascend_npu_quantization.md | 6 +++--- .../npu/quantization/fused_moe_method_npu.py | 2 -- python/sglang/srt/layers/moe/ep_moe/layer.py | 6 +++--- .../compressed_tensors/compressed_tensors.py | 14 ++++++-------- .../compressed_tensors_moe.py | 19 ++++++++++++------- .../layers/quantization/msmodelslim/README.md | 2 +- .../msmodelslim/msmodelslim_moe.py | 2 +- .../schemes/msmodelslim_w8a8_int8.py | 4 +++- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md index 053e12777b0b..08623f6ffba3 100644 --- a/docs/platforms/ascend_npu_quantization.md +++ b/docs/platforms/ascend_npu_quantization.md @@ -5,15 +5,15 @@ To load already quantized models, simply load the model weights and config. Agai MsModelSlim on Ascend support: - [x] W4A4 dynamic linear - [x] W8A8 static linear -- [x] W8A8 dynamic linear +- [x] W8A8 dynamic linear - [x] W4A8 dynamic MOE - [x] W8A8 dynamic MOE AWQ on Ascend support: - [x] W4A16 linear - [x] W8A16 MOE - + Compressed-tensors (LLM Compressor) on Ascend support: -- [x] W8A8 dynamic linear +- [x] W8A8 dynamic linear - [x] W8A8 dynamic MOE - [x] W4A16 MOE diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 8598ff650aa8..17da9aaeadea 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -5,10 +5,8 @@ from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase -from sglang.srt.utils import set_weight_attrs if TYPE_CHECKING: - from sglang.srt.layers.moe import MoeRunnerConfig from sglang.srt.layers.moe.token_dispatcher import ( CombineInput, StandardDispatchOutput, diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index 7fbdcff31e85..001da6e849a0 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -6,9 +6,6 @@ import torch from sglang.srt.environ import envs -from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import( - NPUCompressedTensorsW4A16Int4DynamicMoEMethod, -) from sglang.srt.layers import deep_gemm_wrapper from sglang.srt.layers.moe import ( get_deepep_mode, @@ -22,6 +19,9 @@ ) from sglang.srt.layers.moe.topk import TopKOutput from sglang.srt.layers.quantization.base_config import QuantizationConfig +from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import ( + NPUCompressedTensorsW4A16Int4DynamicMoEMethod, +) from sglang.srt.layers.quantization.fp8 import Fp8Config from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 107ec12a8b11..c515018a100c 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -451,7 +451,7 @@ def _is_wNa16_group_channel( def _is_dynamic_token_w4( self, weight_quant: BaseModel, input_quant: BaseModel - ) -> bool: + ) -> bool: is_w4 = weight_quant.num_bits == 4 weight_strategy = ( weight_quant.strategy == QuantizationStrategy.TENSOR.value @@ -538,13 +538,13 @@ def _get_scheme_from_parts( strategy=weight_quant.strategy, is_static_input_scheme=True, input_symmetric=input_quant.symmetric, - ) + ) elif _is_npu: return NPUCompressedTensorsW8A8Int8( - strategy=weight_quant.strategy, - is_static_input_scheme=True, - input_symmetric=input_quant.symmetric, - ) + strategy=weight_quant.strategy, + is_static_input_scheme=True, + input_symmetric=input_quant.symmetric, + ) if self._is_dynamic_token_w8a8(weight_quant, input_quant): if _is_cuda: @@ -560,8 +560,6 @@ def _get_scheme_from_parts( input_symmetric=input_quant.symmetric, ) - - raise NotImplementedError("No compressed-tensors compatible scheme was found.") def get_scheme( diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index daedb41c3c37..157adfd5b88e 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -11,14 +11,14 @@ from compressed_tensors import CompressionFormat from compressed_tensors.quantization import QuantizationStrategy -from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( - NPUW8A8Int8DynamicMoEMethod, - NPUW4A16Int4DynamicMoEMethod, -) from sglang.srt.distributed import get_tensor_model_parallel_world_size, get_tp_group from sglang.srt.distributed.device_communicators.pynccl_allocator import ( use_symmetric_memory, ) +from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( + NPUW4A16Int4DynamicMoEMethod, + NPUW8A8Int8DynamicMoEMethod, +) from sglang.srt.layers.dp_attention import is_allocation_symmetric from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType @@ -88,7 +88,7 @@ class GPTQMarlinState(Enum): "CompressedTensorsW8A8Fp8MoEMethod", "NPUCompressedTensorsW8A8Int8MoEMethod", "CompressedTensorsWNA16MoEMethod", - "NPUCompressedTensorsW4A16Int4DynamicMoEMethod" + "NPUCompressedTensorsW4A16Int4DynamicMoEMethod", ] @@ -115,8 +115,13 @@ def get_moe_method( logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") return CompressedTensorsWNA16MoEMethod(quant_config) elif _is_npu: - if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None: - logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod") + if ( + quant_config._is_dynamic_token_w4(weight_quant, input_quant) + and input_quant is None + ): + logger.info_once( + "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod" + ) return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod") diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md index c4a5a8b1f14f..d02d8f3b028f 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -5,7 +5,7 @@ Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelsl MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as: - [x] W4A4 dynamic linear - [x] W8A8 static linear -- [x] W8A8 dynamic linear +- [x] W8A8 dynamic linear - [x] W4A8 dynamic MOE - [x] W8A8 dynamic MOE diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index e580fc3df306..e7d7f6c3c745 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -141,7 +141,7 @@ def create_weights( ) layer.register_parameter("w2_weight_offset", w2_weight_offset) set_weight_attrs(w2_weight_offset, extra_weight_attrs) - + # >>> special param for w4a8 if not self.is_per_channel_weight: w13_weight_scale_second = torch.nn.Parameter( diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index c462b2a66bea..8250c7c4c576 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -25,7 +25,9 @@ def __init__( prefix: str, ): self.quant_config = quant_config - self.is_dynamic = self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC" + self.is_dynamic = ( + self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC" + ) def create_weights( self, From 17a62487eff94cd4144a14bc7e58dd1cf1fcc0b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 20:19:43 +0300 Subject: [PATCH 077/175] Update fused_moe_method_npu.py --- .../npu/quantization/fused_moe_method_npu.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 17da9aaeadea..1e6ee311cd42 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -256,11 +256,14 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): @classmethod def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() - if cls.is_per_channel_weight: + + ### TODO fix group_size=0 behaivor + '''if cls.is_per_channel_weight: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu() - return scale_uint64_tensor, None + return scale_uint64_tensor, None''' + per_group_scale = per_group_scale.transpose(1, 2).contiguous() group_num, k, n = weight.shape # the weight of the new version is reduced by half by pack n, so it needs to be restored From 0bf3389d8b1a0bd3df0f867c312b273fd6787013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 20:23:10 +0300 Subject: [PATCH 078/175] Fix missprint --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 1e6ee311cd42..fb39bca777f0 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -257,7 +257,7 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() - ### TODO fix group_size=0 behaivor + ### TODO fix group_size=0 behavior '''if cls.is_per_channel_weight: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 From 1d2815795d759aa7f72fb8d1a8c9fc477ef9d059 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Thu, 18 Dec 2025 20:28:51 +0300 Subject: [PATCH 079/175] Pre-commit fixes --- .../npu/quantization/fused_moe_method_npu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index fb39bca777f0..0c9a6940ec0a 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -258,12 +258,12 @@ def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): scale = scale.transpose(1, 2).contiguous() ### TODO fix group_size=0 behavior - '''if cls.is_per_channel_weight: + """if cls.is_per_channel_weight: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu() - return scale_uint64_tensor, None''' - + return scale_uint64_tensor, None""" + per_group_scale = per_group_scale.transpose(1, 2).contiguous() group_num, k, n = weight.shape # the weight of the new version is reduced by half by pack n, so it needs to be restored From a5b88e9b9861df1dad0b31d417d5d22dbcace79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 22:42:55 +0300 Subject: [PATCH 080/175] Update ascend_npu_quantization.md --- docs/platforms/ascend_npu_quantization.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md index 08623f6ffba3..8f663a9a60af 100644 --- a/docs/platforms/ascend_npu_quantization.md +++ b/docs/platforms/ascend_npu_quantization.md @@ -9,9 +9,11 @@ MsModelSlim on Ascend support: - [x] W4A8 dynamic MOE - [x] W8A8 dynamic MOE -AWQ on Ascend support: +AWQ on Ascend support: - [x] W4A16 linear -- [x] W8A16 MOE +- [x] W8A16 linear # Test required +- [x] W4A16 MOE # Test required +- [x] W8A16 MOE # Test required Compressed-tensors (LLM Compressor) on Ascend support: - [x] W8A8 dynamic linear From 30f7b10bdc6c04c3d7fc54edaccacaed1ea453f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 18 Dec 2025 22:44:55 +0300 Subject: [PATCH 081/175] Update ascend_npu_quantization.md --- docs/platforms/ascend_npu_quantization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md index 8f663a9a60af..860ad950f259 100644 --- a/docs/platforms/ascend_npu_quantization.md +++ b/docs/platforms/ascend_npu_quantization.md @@ -9,7 +9,7 @@ MsModelSlim on Ascend support: - [x] W4A8 dynamic MOE - [x] W8A8 dynamic MOE -AWQ on Ascend support: +AWQ on Ascend support: - [x] W4A16 linear - [x] W8A16 linear # Test required - [x] W4A16 MOE # Test required From 22c85ce1d467c72d552bf776104ef1a567622972 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 19 Dec 2025 10:56:36 +0300 Subject: [PATCH 082/175] Update python/sglang/srt/configs/model_config.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- python/sglang/srt/configs/model_config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index e9f479f4a33b..c989d9fddf92 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -732,11 +732,11 @@ def _verify_quantization(self) -> None: # Filter out None values cfg_list = [item for item in cfg_list if item is not None] - assert ( - len(cfg_list) == 1 - ), "Config list contains configs from 2 methods, must be only 1" - - quant_cfg = cfg_list[0] + if len(cfg_list) > 1: + raise ValueError( + "Config list contains configs from 2 methods, must be only 1" + ) + quant_cfg = cfg_list[0] if cfg_list else None if quant_cfg is not None: quant_method = quant_cfg.get( From 21b9219b8b5693a80817386bde9051e024daf67c Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 19 Dec 2025 12:48:41 +0300 Subject: [PATCH 083/175] Update compressed_tensors.py Review fix 1/5 --- .../compressed_tensors/compressed_tensors.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index c515018a100c..8a41fe3fa7b1 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -44,7 +44,7 @@ CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8, CompressedTensorsWNA16, - GPUCompressedTensorsW8A8Int8, + CompressedTensorsW8A8Int8, NPUCompressedTensorsW8A8Int8, ) from sglang.srt.layers.quantization.compressed_tensors.utils import ( @@ -533,13 +533,13 @@ def _get_scheme_from_parts( ) if self._is_static_tensor_w8a8(weight_quant, input_quant): - if _is_cuda: - return GPUCompressedTensorsW8A8Int8( + if not _is_npu: + return CompressedTensorsW8A8Int8( strategy=weight_quant.strategy, is_static_input_scheme=True, input_symmetric=input_quant.symmetric, ) - elif _is_npu: + else: return NPUCompressedTensorsW8A8Int8( strategy=weight_quant.strategy, is_static_input_scheme=True, @@ -547,13 +547,13 @@ def _get_scheme_from_parts( ) if self._is_dynamic_token_w8a8(weight_quant, input_quant): - if _is_cuda: - return GPUCompressedTensorsW8A8Int8( + if not _is_npu: + return CompressedTensorsW8A8Int8( strategy=weight_quant.strategy, is_static_input_scheme=False, input_symmetric=input_quant.symmetric, ) - elif _is_npu: + else: return NPUCompressedTensorsW8A8Int8( strategy=weight_quant.strategy, is_static_input_scheme=False, From 52b10881814d670ae2fd3f083244801fabb29266 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 19 Dec 2025 12:49:10 +0300 Subject: [PATCH 084/175] Update compressed_tensors_moe.py Review fix 2/5 --- .../compressed_tensors_moe.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 157adfd5b88e..5cd2ec12792a 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -111,17 +111,12 @@ def get_moe_method( input_quant = quant_config.target_scheme_map["Linear"].get("input_activations") if quant_config._is_wNa16_group_channel(weight_quant, input_quant): - if _is_cuda or _is_hip: + if not _is_npu: logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") return CompressedTensorsWNA16MoEMethod(quant_config) - elif _is_npu: - if ( - quant_config._is_dynamic_token_w4(weight_quant, input_quant) - and input_quant is None - ): - logger.info_once( - "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod" - ) + else: + if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None: + logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod") return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod") @@ -887,10 +882,7 @@ def __init__(self, quant_config: CompressedTensorsConfig): self.input_quant = self.quant_config.target_scheme_map["Linear"].get( "input_activations" ) - if not _is_npu: - raise NotImplementedError( - "w8a8 int8 compressed tensors moe scheme is supported only for Ascend device for now." - ) + self.static_input_scales = not self.input_quant.dynamic per_channel = ( self.weight_quant.strategy == QuantizationStrategy.CHANNEL @@ -1314,6 +1306,9 @@ def __init__(self, quantization_config) -> None: else: self.group_size = 128 + # TODO: See if we can merge this method's logic + # with CompressedTensorsWNA16MoEMethod. Need more models and tests. + # @OrangeRedeng @TamirBaydasov def create_weights( self, layer: torch.nn.Module, From 2a5f7457ca840c05e9595499fa2c1935fdab4c18 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 19 Dec 2025 12:49:49 +0300 Subject: [PATCH 085/175] Update __init__.py Review fix 3/5 --- .../quantization/compressed_tensors/schemes/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py index baf528fea204..70ca328c8a91 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py @@ -4,7 +4,7 @@ from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4 from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 from .compressed_tensors_w8a8_int8 import ( - GPUCompressedTensorsW8A8Int8, + CompressedTensorsW8A8Int8, NPUCompressedTensorsW8A8Int8, ) from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 @@ -14,7 +14,7 @@ "CompressedTensorsScheme", "CompressedTensorsW8A8Fp8", "CompressedTensorsW8A16Fp8", - "GPUCompressedTensorsW8A8Int8", + "CompressedTensorsW8A8Int8", "NPUCompressedTensorsW8A8Int8", "CompressedTensorsWNA16", "WNA16_SUPPORTED_BITS", From 309e5efdb3db2389dfbc44b8c35ac4f1fbd86189 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 19 Dec 2025 12:50:23 +0300 Subject: [PATCH 086/175] Update compressed_tensors_w8a8_int8.py Review fix 4/5 --- .../schemes/compressed_tensors_w8a8_int8.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index d307f6b01c33..6db89e9f1ac2 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -91,14 +91,6 @@ def create_weights( ) layer.register_parameter("input_zero_point", input_zero_point) - -class GPUCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8): - - def __init__( - self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool - ): - super().__init__(strategy, is_static_input_scheme, input_symmetric) - @classmethod def get_min_capability(cls) -> int: # ampere and up From 611546d32cf8326861c93e1b17beedc9765289db Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 19 Dec 2025 12:51:27 +0300 Subject: [PATCH 087/175] Update README.md Review fix 5/5 --- python/sglang/srt/layers/quantization/msmodelslim/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md index d02d8f3b028f..65f5eb029323 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md @@ -1,6 +1,6 @@ Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module. -`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config. +`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with MSModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config. MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as: - [x] W4A4 dynamic linear From d2888fdd86bc9c3bd240bda1ca2b84b83d2b7a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 19 Dec 2025 14:48:49 +0300 Subject: [PATCH 088/175] Update linear_method_npu.py --- .../srt/hardware_backend/npu/quantization/linear_method_npu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 2d70834caf0b..6ab0d35652d1 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -134,7 +134,6 @@ def apply( def process_weights_after_loading(layer): layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) layer.weight_offset.data = layer.weight_offset.data.flatten() layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( layer.weight.data.to(torch.int32) From 554027a6db0b305286e5c8bdb412a83ca333ce05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:02:34 +0300 Subject: [PATCH 089/175] Fix group_size --- .../npu/quantization/fused_moe_method_npu.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 0c9a6940ec0a..13d72581ff81 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -254,15 +254,14 @@ def apply_without_routing_weights( class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): @classmethod - def process_scale(cls, weight: torch.Tensor, scale, per_group_scale): + def process_scale(cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight): scale = scale.transpose(1, 2).contiguous() - ### TODO fix group_size=0 behavior - """if cls.is_per_channel_weight: + if is_per_channel_weight: scale_np = scale.cpu().numpy() scale_np.dtype = np.uint32 scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu() - return scale_uint64_tensor, None""" + return scale_uint64_tensor, None per_group_scale = per_group_scale.transpose(1, 2).contiguous() group_num, k, n = weight.shape @@ -306,7 +305,7 @@ def pack_to_int32(cls, weight: torch.Tensor): return weight.view(torch.int32).contiguous() @classmethod - def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: + def process_weights_after_loading(cls, layer: torch.nn.Module, is_per_channel_weight) -> None: layer.w13_weight = torch.nn.Parameter( layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False ) @@ -325,10 +324,10 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: else None ) layer.w13_weight_scale.data, w13_bias = cls.process_scale( - layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second + layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight ) layer.w2_weight_scale.data, w2_bias = cls.process_scale( - layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second + layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight ) if hasattr(layer, "w13_weight_scale_second"): # scale_second is no longer used, release this part of the memory From ad52cda6d3882cc9940916168d3985be4b81590a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:03:26 +0300 Subject: [PATCH 090/175] Fix group_size --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index e7d7f6c3c745..2729c8e0d477 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -212,7 +212,7 @@ def create_weights( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer) + NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer, self.is_per_channel_weight) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" From 1d0eddb200441f03627ff9e5153f8592b04ab1d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:29:20 +0300 Subject: [PATCH 091/175] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 13d72581ff81..830d5ca2fa11 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -362,8 +362,8 @@ def apply( num_tokens = hidden_states.shape[:-1].numel() first_expert_idx = 0 - last_expert_idx = 128 - global_num_experts = 128 + last_expert_idx = layer.num_experts + global_num_experts = layer.num_experts sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = ( torch.ops.npu.npu_moe_init_routing_v2( From c2e972fdb8a5440b6077898bd1587b42a8554472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 22 Dec 2025 10:31:50 +0300 Subject: [PATCH 092/175] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 830d5ca2fa11..3c4063b21fe7 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -384,7 +384,6 @@ def apply( bias2 = [layer.w2_scale_bias] w1_scale = [layer.w13_weight_scale] w2_scale = [layer.w2_weight_scale] - # TODO w4a8 scene: dynamic acquisition of dtype in the future _output_dtype = torch.bfloat16 hidden_states = torch.ops.npu.npu_grouped_matmul( From 3bc7fafd905dca83fc8a69d2ce60edc75fcea2d9 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Mon, 22 Dec 2025 10:32:26 +0300 Subject: [PATCH 093/175] Pre-commit fixes --- .../npu/quantization/fused_moe_method_npu.py | 18 ++++++++++++++---- .../compressed_tensors/compressed_tensors.py | 2 +- .../compressed_tensors_moe.py | 9 +++++++-- .../msmodelslim/msmodelslim_moe.py | 4 +++- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 3c4063b21fe7..4a7f2e22845d 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -254,7 +254,9 @@ def apply_without_routing_weights( class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): @classmethod - def process_scale(cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight): + def process_scale( + cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight + ): scale = scale.transpose(1, 2).contiguous() if is_per_channel_weight: @@ -305,7 +307,9 @@ def pack_to_int32(cls, weight: torch.Tensor): return weight.view(torch.int32).contiguous() @classmethod - def process_weights_after_loading(cls, layer: torch.nn.Module, is_per_channel_weight) -> None: + def process_weights_after_loading( + cls, layer: torch.nn.Module, is_per_channel_weight + ) -> None: layer.w13_weight = torch.nn.Parameter( layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False ) @@ -324,10 +328,16 @@ def process_weights_after_loading(cls, layer: torch.nn.Module, is_per_channel_we else None ) layer.w13_weight_scale.data, w13_bias = cls.process_scale( - layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight + layer.w13_weight, + layer.w13_weight_scale.data, + w13_weight_scale_second, + is_per_channel_weight, ) layer.w2_weight_scale.data, w2_bias = cls.process_scale( - layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight + layer.w2_weight, + layer.w2_weight_scale.data, + w2_weight_scale_second, + is_per_channel_weight, ) if hasattr(layer, "w13_weight_scale_second"): # scale_second is no longer used, release this part of the memory diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 8a41fe3fa7b1..0ed642950fbc 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -42,9 +42,9 @@ CompressedTensorsScheme, CompressedTensorsW4A4Fp4, CompressedTensorsW8A8Fp8, + CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsWNA16, - CompressedTensorsW8A8Int8, NPUCompressedTensorsW8A8Int8, ) from sglang.srt.layers.quantization.compressed_tensors.utils import ( diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 5cd2ec12792a..39d53e88ee3f 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -115,8 +115,13 @@ def get_moe_method( logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") return CompressedTensorsWNA16MoEMethod(quant_config) else: - if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None: - logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod") + if ( + quant_config._is_dynamic_token_w4(weight_quant, input_quant) + and input_quant is None + ): + logger.info_once( + "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod" + ) return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod") diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 2729c8e0d477..5cec89a39773 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -212,7 +212,9 @@ def create_weights( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer, self.is_per_channel_weight) + NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading( + layer, self.is_per_channel_weight + ) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" From ff1f793736f7bb27d51601061c5e112d6ef80989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 22 Dec 2025 12:34:23 +0300 Subject: [PATCH 094/175] Fix Qwen3-32B AWQ issue --- python/sglang/srt/layers/quantization/awq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/layers/quantization/awq.py b/python/sglang/srt/layers/quantization/awq.py index 5497900a0ce3..69f192840467 100644 --- a/python/sglang/srt/layers/quantization/awq.py +++ b/python/sglang/srt/layers/quantization/awq.py @@ -627,8 +627,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: qzeros_tmp = -(qzeros_tmp - 8) qzeros_tmp = qzeros_tmp.to(layer.scales.data.dtype) - layer.qzeros = torch.nn.Parameter(qzeros_tmp, requires_grad=False) - layer.qweight = torch.nn.Parameter(qweight_tmp, requires_grad=False) + layer.zeros = torch.nn.Parameter(qzeros_tmp, requires_grad=False) + layer.weight = torch.nn.Parameter(qweight_tmp, requires_grad=False) def apply( self, @@ -636,9 +636,9 @@ def apply( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - qweight = layer.qweight + qweight = layer.weight scales = layer.scales - qzeros = layer.qzeros + qzeros = layer.zeros pack_factor = self.quant_config.pack_factor out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,) reshaped_x = x.reshape(-1, x.shape[-1]) From 7cbf9645ee185db010805f13534bf2f43c5b95e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 22 Dec 2025 13:17:31 +0300 Subject: [PATCH 095/175] Update ascend_npu_quantization.md --- docs/platforms/ascend_npu_quantization.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md index 860ad950f259..172b5e295fb6 100644 --- a/docs/platforms/ascend_npu_quantization.md +++ b/docs/platforms/ascend_npu_quantization.md @@ -13,7 +13,6 @@ AWQ on Ascend support: - [x] W4A16 linear - [x] W8A16 linear # Test required - [x] W4A16 MOE # Test required -- [x] W8A16 MOE # Test required Compressed-tensors (LLM Compressor) on Ascend support: - [x] W8A8 dynamic linear From 7b20ccf164cd563151a1a4ea839d238f621ec91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 22 Dec 2025 13:19:07 +0300 Subject: [PATCH 096/175] Update ascend_npu_quantization.md From e1cabfa2faf521329282484fe75e5a83776f9af3 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:07:47 +0300 Subject: [PATCH 097/175] Update fused_moe_method_npu.py Static method removal commit 1/9 --- .../npu/quantization/fused_moe_method_npu.py | 87 +++++++++---------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 4a7f2e22845d..f1ee05f2584c 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import numpy as np import torch @@ -11,6 +11,7 @@ CombineInput, StandardDispatchOutput, ) + from sglang.srt.layers.quantization.base_config import QuantizationConfig def npu_fused_experts( @@ -138,22 +139,29 @@ def npu_fused_moe_without_routing_weights_bf16( return hidden_states -class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase): +class _NPUFusedMoEMethodBase(FusedMoEMethodBase): - @classmethod - def release_weight_cache(cls, weight: torch.Tensor): + def __init__( + self, + quant_config: Optional["QuantizationConfig"] = None, + ): + self.quant_config = quant_config + + +class NPUW8A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase): + + def _release_weight_cache(self, weight: torch.Tensor): # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) origin_weight = weight.data.transpose(1, 2) new_weight = origin_weight.contiguous() origin_weight.untyped_storage().resize_(0) return new_weight - @classmethod - def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: - weight_data = cls.release_weight_cache(layer.w13_weight.data) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight_data = self._release_weight_cache(layer.w13_weight.data) layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) - weight_data = cls.release_weight_cache(layer.w2_weight.data) + weight_data = self._release_weight_cache(layer.w2_weight.data) layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) layer.w13_weight_scale = torch.nn.Parameter( @@ -178,8 +186,8 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - @staticmethod def apply( + self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": @@ -203,8 +211,8 @@ def apply( ) return StandardCombineInput(hidden_states=output) - @staticmethod def apply_without_routing_weights( + self, layer, hidden_states, hidden_states_scale, @@ -251,12 +259,9 @@ def apply_without_routing_weights( return hidden_states -class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase): +class NPUW4A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase): - @classmethod - def process_scale( - cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight - ): + def _process_scale(self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight): scale = scale.transpose(1, 2).contiguous() if is_per_channel_weight: @@ -289,8 +294,7 @@ def process_scale( sscale_uint64_tensor = sscale_uint64_tensor.npu() return sscale_uint64_tensor, bias - @classmethod - def update_bias(cls, layer, w13_bias, w2_bias): + def _update_bias(self, layer, w13_bias, w2_bias): layer.w13_scale_bias.data = ( layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) ) @@ -298,17 +302,15 @@ def update_bias(cls, layer, w13_bias, w2_bias): layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1) ) - @classmethod - def pack_to_int32(cls, weight: torch.Tensor): + def _pack_to_int32(self, weight: torch.Tensor): # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 assert ( weight.shape[-1] % 4 == 0 ), "the last dim of weight needs to be divided by 4" return weight.view(torch.int32).contiguous() - @classmethod def process_weights_after_loading( - cls, layer: torch.nn.Module, is_per_channel_weight + self, layer: torch.nn.Module, is_per_channel_weight ) -> None: layer.w13_weight = torch.nn.Parameter( layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False @@ -327,17 +329,17 @@ def process_weights_after_loading( if hasattr(layer, "w2_weight_scale_second") else None ) - layer.w13_weight_scale.data, w13_bias = cls.process_scale( + layer.w13_weight_scale.data, w13_bias = self._process_scale( layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, - is_per_channel_weight, + is_per_channel_weight ) - layer.w2_weight_scale.data, w2_bias = cls.process_scale( + layer.w2_weight_scale.data, w2_bias = self._process_scale( layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, - is_per_channel_weight, + is_per_channel_weight ) if hasattr(layer, "w13_weight_scale_second"): # scale_second is no longer used, release this part of the memory @@ -346,15 +348,15 @@ def process_weights_after_loading( del layer.w13_weight_offset_second del layer.w2_weight_offset_second - cls.update_bias(layer, w13_bias, w2_bias) + self._update_bias(layer, w13_bias, w2_bias) layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - layer.w13_weight.data = cls.pack_to_int32(layer.w13_weight.data) - layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data) + layer.w13_weight.data = self._pack_to_int32(layer.w13_weight.data) + layer.w2_weight.data = self._pack_to_int32(layer.w2_weight.data) - @staticmethod def apply( + self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": @@ -437,8 +439,8 @@ def apply( return StandardCombineInput(hidden_states=final_hidden_states) - @staticmethod def apply_without_routing_weights( + self, layer, hidden_states, hidden_states_scale, @@ -481,10 +483,9 @@ def apply_without_routing_weights( return hidden_states -class NPUW4A16Int4DynamicMoEMethod(FusedMoEMethodBase): +class NPUW4A16Int4DynamicMoEMethod(_NPUFusedMoEMethodBase): - @classmethod - def pack_to_int32(cls, weight: torch.Tensor): + def _pack_to_int32(self, weight: torch.Tensor): assert weight.dim() == 3 if weight.dtype == torch.int32: # pack 8 int4 to int32, we use a int32 to represent a int4 @@ -505,9 +506,8 @@ def pack_to_int32(cls, weight: torch.Tensor): raise ValueError(f"{weight.dtype=} is not supported !") return new_weight - @classmethod - def unpack_from_int32( - cls, + def _unpack_from_int32( + self, value: torch.Tensor, num_bits: int, shape: torch.Size = None, @@ -570,8 +570,7 @@ def unpack_from_int32( return unpacked - @classmethod - def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous() w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous() layer.w13_weight_scale = torch.nn.Parameter( @@ -592,28 +591,28 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None: # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous() # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous() unpacked_w13_weight = ( - cls.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4) + self._unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4) .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1) .transpose(1, 2) .contiguous() .int() ) unpacked_w2_weight = ( - cls.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4) + self._unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4) .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1) .transpose(1, 2) .contiguous() .int() ) - w13_weight = cls.pack_to_int32(unpacked_w13_weight) - w2_weight = cls.pack_to_int32(unpacked_w2_weight) + w13_weight = self._pack_to_int32(unpacked_w13_weight) + w2_weight = self._pack_to_int32(unpacked_w2_weight) layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - @staticmethod def apply( + self, layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": @@ -640,8 +639,8 @@ def apply( ) return StandardCombineInput(hidden_states=output) - @staticmethod def apply_without_routing_weights( + self, layer, hidden_states, hidden_states_scale, From 734ab1dc712593e08e9eb28c54c467f87f77f14e Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:08:27 +0300 Subject: [PATCH 098/175] Update linear_method_npu.py Static method removal 2/9 --- .../npu/quantization/linear_method_npu.py | 81 +++++++++---------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py index 6ab0d35652d1..3a99f6ac7c3b 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py @@ -15,14 +15,36 @@ def __init__( self, quant_config: Optional["QuantizationConfig"] = None, ): - super().__init__() self.quant_config = quant_config class NPUW8A8Int8LinearMethod(_NPULinearMethodBase): - @staticmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight.data = npu_format_cast(layer.weight.data) + + layer.weight_scale.data = layer.weight_scale.data.flatten() + # Compressed-tensors format doesn't have this field + if hasattr(layer, "weight_offset"): + layer.weight_offset.data = layer.weight_offset.data.flatten() + + expanding_factor = layer.weight.data.shape[0] + layer.aclnn_input_scale = torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( + layer.input_scale.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + layer.aclnn_input_offset = torch.nn.Parameter( + layer.input_offset.data.repeat(expanding_factor).to(device="npu"), + requires_grad=False, + ) + def apply( + self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None, @@ -53,8 +75,10 @@ def apply( output_dtype=original_dtype, ) - @staticmethod - def process_weights_after_loading(layer: torch.nn.Module): + +class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase): + + def process_weights_after_loading(self, layer: torch.nn.Module): layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() layer.weight.data = npu_format_cast(layer.weight.data) @@ -63,25 +87,8 @@ def process_weights_after_loading(layer: torch.nn.Module): if hasattr(layer, "weight_offset"): layer.weight_offset.data = layer.weight_offset.data.flatten() - expanding_factor = layer.weight.data.shape[0] - layer.aclnn_input_scale = torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter( - layer.input_scale.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - layer.aclnn_input_offset = torch.nn.Parameter( - layer.input_offset.data.repeat(expanding_factor).to(device="npu"), - requires_grad=False, - ) - - -class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase): - - @staticmethod def apply( + self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None, @@ -97,21 +104,20 @@ def apply( output_dtype=original_dtype, ) - @staticmethod - def process_weights_after_loading(layer: torch.nn.Module): - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight.data = npu_format_cast(layer.weight.data) - - layer.weight_scale.data = layer.weight_scale.data.flatten() - # Compressed-tensors format doesn't have this field - if hasattr(layer, "weight_offset"): - layer.weight_offset.data = layer.weight_offset.data.flatten() - class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase): - @staticmethod + def process_weights_after_loading(self, layer): + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32) + ) + def apply( + self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None, @@ -129,12 +135,3 @@ def apply( bias=bias, output_dtype=original_dtype, ) - - @staticmethod - def process_weights_after_loading(layer): - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight_scale.data = layer.weight_scale.data.flatten() - layer.weight_offset.data = layer.weight_offset.data.flatten() - layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack( - layer.weight.data.to(torch.int32) - ) From 93533b07bf20e1f4e95de4a7a407ac07d2240b13 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:09:26 +0300 Subject: [PATCH 099/175] Update base_config.py static method removal 3/9 --- python/sglang/srt/layers/quantization/base_config.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/sglang/srt/layers/quantization/base_config.py b/python/sglang/srt/layers/quantization/base_config.py index 8297124cc4c0..3e93cfde7e70 100644 --- a/python/sglang/srt/layers/quantization/base_config.py +++ b/python/sglang/srt/layers/quantization/base_config.py @@ -17,7 +17,6 @@ class QuantizeMethodBase(ABC): """Base class for different quantized methods.""" - @abstractmethod def create_weights( self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs ): @@ -44,7 +43,6 @@ def process_weights_after_loading(self, layer: nn.Module) -> None: class LinearMethodBase(QuantizeMethodBase): """Base class for different (maybe quantized) linear methods.""" - @abstractmethod def create_weights( self, layer: torch.nn.Module, @@ -84,7 +82,6 @@ def apply( class FusedMoEMethodBase(QuantizeMethodBase): - @abstractmethod def create_weights( self, layer: torch.nn.Module, @@ -96,7 +93,6 @@ def create_weights( ): raise NotImplementedError - @abstractmethod def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig ): From 0cd79c63389a9598eaf1fb773deda22d9c80bd2a Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:10:11 +0300 Subject: [PATCH 100/175] Update compressed_tensors_moe.py static method removal 4/9 --- .../compressed_tensors/compressed_tensors_moe.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 39d53e88ee3f..0d876f489bbd 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -887,6 +887,7 @@ def __init__(self, quant_config: CompressedTensorsConfig): self.input_quant = self.quant_config.target_scheme_map["Linear"].get( "input_activations" ) + self.kernel = NPUW8A8Int8DynamicMoEMethod() self.static_input_scales = not self.input_quant.dynamic per_channel = ( @@ -973,7 +974,7 @@ def create_weights( layer.w2_input_scale = None def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer) + self.kernel.process_weights_after_loading(layer) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig @@ -986,7 +987,7 @@ def apply( dispatch_output: StandardDispatchOutput, ) -> CombineInput: - return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + return self.kernel.apply(layer, dispatch_output) class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): @@ -1310,6 +1311,8 @@ def __init__(self, quantization_config) -> None: ].group_size else: self.group_size = 128 + + self.kernel = NPUW4A16Int4DynamicMoEMethod() # TODO: See if we can merge this method's logic # with CompressedTensorsWNA16MoEMethod. Need more models and tests. @@ -1412,7 +1415,7 @@ def create_weights( set_weight_attrs(w2_weight_offset, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - NPUW4A16Int4DynamicMoEMethod.process_weights_after_loading(layer) + self.kernel.process_weights_after_loading(layer) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig @@ -1425,7 +1428,7 @@ def apply( dispatch_output: StandardDispatchOutput, ) -> CombineInput: - return NPUW4A16Int4DynamicMoEMethod.apply(layer, dispatch_output) + return self.kernel.apply(layer, dispatch_output) def apply_without_routing_weights( self, @@ -1436,7 +1439,7 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW4A16Int4DynamicMoEMethod.apply_without_routing_weights( + return self.kernel.apply_without_routing_weights( layer, hidden_states, hidden_states_scale, From a9d4847b80cc094b1a03d6338093a8548ee9ec18 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:12:08 +0300 Subject: [PATCH 101/175] Update compressed_tensors_w8a8_int8.py static method removal 5/9 --- .../schemes/compressed_tensors_w8a8_int8.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 6db89e9f1ac2..efcd4b611fa9 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -188,13 +188,14 @@ def __init__( raise NotImplementedError( "Static compressed-tensors scheme is not yet supported on NPU." ) + self.kernel = NPUW8A8Int8DynamicLinearMethod() @classmethod def get_min_capability(cls) -> int: return NotImplementedError def process_weights_after_loading(self, layer): - return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) + return self.kernel.process_weights_after_loading(layer) def apply_weights(self, layer, x, bias): - return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias) + return self.kernel.apply(layer, x, bias) From af3756b101b09448bb90887725cf6c75242bc7de Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:12:40 +0300 Subject: [PATCH 102/175] Update msmodelslim.py static method removal 6/9 --- .../srt/layers/quantization/msmodelslim/msmodelslim.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py index dc43e6b79b5d..61913209da4a 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py @@ -150,10 +150,6 @@ def get_quant_method( if self.is_layer_skipped(prefix, packed_modules_mapping_subset): return UnquantizedLinearMethod() scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config) - if scheme is None: - raise NotImplementedError( - "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes." - ) layer.scheme = scheme return ModelSlimLinearMethod(self) elif isinstance(layer, FusedMoE): @@ -174,6 +170,7 @@ def _get_scheme_from_parts( return ModelSlimW4A4Int4( quant_config=self.quant_description, prefix=layer_name ) + raise NotImplementedError("No modelslim compatible scheme was found.") def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None From 1ddd8d440153675f448f8e390fb0fcfb5bdc6f61 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:13:04 +0300 Subject: [PATCH 103/175] Update msmodelslim_moe.py static method removal 7/9 --- .../msmodelslim/msmodelslim_moe.py | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 5cec89a39773..435ba8a4f945 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -52,11 +52,21 @@ def get_moe_method( quant_config.quant_description.get(prefix_in_quant_config, "STATIC") == "W4A8_DYNAMIC" ) - + is_moe_w8a8_dynamic = ( + quant_config.quant_description.get(prefix_in_quant_config, "STATIC") + == "W8A8_DYNAMIC" + ) if is_moe_w4a8_dynamic: + logger.info_once("Using ModelSlimW4A8Int8MoE") return ModelSlimW4A8Int8MoE(quant_config) - - return ModelSlimW8A8Int8MoE(quant_config) + elif is_moe_w8a8_dynamic: + logger.info_once("Using ModelSlimW8A8Int8MoE") + return ModelSlimW8A8Int8MoE(quant_config) + else: + raise RuntimeError( + f"Unsupported FusedMoe modelslim scheme: \ + {quant_config.quant_description.get(prefix_in_quant_config)}" + ) class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod): @@ -69,6 +79,7 @@ def __init__( self.quant_config = quant_config self.group_size = 0 self.tp_size = 1 + self.kernel = NPUW4A8Int8DynamicMoEMethod() def create_weights( self, @@ -212,9 +223,7 @@ def create_weights( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading( - layer, self.is_per_channel_weight - ) + self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" @@ -226,7 +235,7 @@ def apply( layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": - return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + return self.kernel.apply(layer, dispatch_output) def apply_without_routing_weights( self, @@ -237,7 +246,7 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights( + return self.kernel.apply_without_routing_weights( layer, hidden_states, hidden_states_scale, @@ -255,6 +264,7 @@ def __init__( prefix: str = None, ): self.quant_config = quant_config + self.kernel = NPUW8A8Int8DynamicMoEMethod() def create_weights( self, @@ -327,7 +337,7 @@ def create_weights( set_weight_attrs(w2_weight_offset, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer) + self.kernel.process_weights_after_loading(layer) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" @@ -339,7 +349,7 @@ def apply( layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": - return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output) + return self.kernel.apply(layer, dispatch_output) def apply_without_routing_weights( self, @@ -350,7 +360,7 @@ def apply_without_routing_weights( group_list, output_dtype, ): - return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights( + return self.kernel.apply_without_routing_weights( layer, hidden_states, hidden_states_scale, From 76a1e948bd265158fb38ba0f668a0db8e35baa7a Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:13:54 +0300 Subject: [PATCH 104/175] Update msmodelslim_w4a4_int4.py static method removal 8/9 --- .../msmodelslim/schemes/msmodelslim_w4a4_int4.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py index 1b578837c8d4..6fb7561cc438 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py @@ -22,6 +22,7 @@ def __init__( ): self.quant_config = quant_config self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC" + self.kernel = NPU_W4A4DynamicLinearMethod() @staticmethod def get_weight( @@ -87,7 +88,7 @@ def create_weights( set_weight_attrs(param, extra_weight_attrs) def process_weights_after_loading(self, layer): - NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer) + self.kernel.process_weights_after_loading(layer) def apply_weights( self, @@ -95,4 +96,4 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias) + return self.kernel.apply(layer, x, bias) From f773ee4b92c10968dd23c344ef3ad1f17baeb7b2 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:14:19 +0300 Subject: [PATCH 105/175] Update msmodelslim_w8a8_int8.py static method removal 9/9 --- .../msmodelslim/schemes/msmodelslim_w8a8_int8.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index 8250c7c4c576..1e1e99fc174d 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -28,6 +28,10 @@ def __init__( self.is_dynamic = ( self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC" ) + if self.is_dynamic: + self.kernel = NPUW8A8Int8DynamicLinearMethod() + else: + self.kernel = NPUW8A8Int8LinearMethod() def create_weights( self, @@ -102,10 +106,7 @@ def create_weights( layer.register_parameter("deq_scale", deq_scale) def process_weights_after_loading(self, layer: torch.nn.Module): - if self.is_dynamic: - NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer) - else: - NPUW8A8Int8LinearMethod.process_weights_after_loading(layer) + self.kernel.process_weights_after_loading(layer) def apply_weights( self, @@ -113,7 +114,4 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if self.is_dynamic: - return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias) - else: - return NPUW8A8Int8LinearMethod.apply(layer, x, bias) + return self.kernel.apply(layer, x, bias) From a6d161985e53f243eb2a2d2c66ad78d9a216ad70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 23 Dec 2025 11:39:53 +0300 Subject: [PATCH 106/175] Update msmodelslim_moe.py --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 435ba8a4f945..ff8639b7678e 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -235,7 +235,11 @@ def apply( layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": - return self.kernel.apply(layer, dispatch_output) + # FIXME W4A8 without EP gives 0 accuracy + raise NotImplementedError( + f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep" + ) + # return self.kernel.apply(layer, dispatch_output) def apply_without_routing_weights( self, From 789c2468b69609ae5bdeb80ab6aab404584bbbf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 23 Dec 2025 11:45:54 +0300 Subject: [PATCH 107/175] Fix lint issue --- .../npu/quantization/fused_moe_method_npu.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index f1ee05f2584c..05ace7966a79 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -261,7 +261,9 @@ def apply_without_routing_weights( class NPUW4A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase): - def _process_scale(self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight): + def _process_scale( + self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight + ): scale = scale.transpose(1, 2).contiguous() if is_per_channel_weight: @@ -333,13 +335,13 @@ def process_weights_after_loading( layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, - is_per_channel_weight + is_per_channel_weight, ) layer.w2_weight_scale.data, w2_bias = self._process_scale( layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, - is_per_channel_weight + is_per_channel_weight, ) if hasattr(layer, "w13_weight_scale_second"): # scale_second is no longer used, release this part of the memory From 2cc4db4a3faccb94d867d2075f50f911999805a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 23 Dec 2025 11:46:59 +0300 Subject: [PATCH 108/175] Fix lint issue --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 0d876f489bbd..1947e331c740 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1311,7 +1311,7 @@ def __init__(self, quantization_config) -> None: ].group_size else: self.group_size = 128 - + self.kernel = NPUW4A16Int4DynamicMoEMethod() # TODO: See if we can merge this method's logic From 94827ef52e428c8510d8bcca8b46c5be5e5db2ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 23 Dec 2025 11:47:47 +0300 Subject: [PATCH 109/175] Fix lint issue --- .../quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py index 1e1e99fc174d..9986e1976eaf 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py @@ -106,7 +106,7 @@ def create_weights( layer.register_parameter("deq_scale", deq_scale) def process_weights_after_loading(self, layer: torch.nn.Module): - self.kernel.process_weights_after_loading(layer) + self.kernel.process_weights_after_loading(layer) def apply_weights( self, @@ -114,4 +114,4 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.kernel.apply(layer, x, bias) + return self.kernel.apply(layer, x, bias) From 1a30a428fd6bd05237ccbb8d21c26eccfc02aba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 23 Dec 2025 15:12:24 +0300 Subject: [PATCH 110/175] Change local path to modelscope --- test/srt/ascend/test_ascend_w4a4_quantization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index c2251ec94a9d..fbce7bdd9327 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -21,6 +21,8 @@ popen_launch_server, ) +os.environ['SGLANG_USE_MODELSCOPE'] = 'true' + if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( @@ -32,7 +34,7 @@ class TestAscendW4A4(CustomTestCase): @classmethod def setUpClass(cls): - cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/" + cls.model = "Eco-Tech/Qwen3-8B-w4a4-QuaRot" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, From f539100c86df00d36e062f90a14440f7dd43c49d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 23 Dec 2025 15:15:52 +0300 Subject: [PATCH 111/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index fbce7bdd9327..9525b0407a08 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -21,7 +21,7 @@ popen_launch_server, ) -os.environ['SGLANG_USE_MODELSCOPE'] = 'true' +os.environ["SGLANG_USE_MODELSCOPE"] = "true" if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" From 01f6c58a938d526540ef83b76273b61d861cfcd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:02:21 +0300 Subject: [PATCH 112/175] Temporary fix --- test/srt/run_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index d09d484339a0..fb093110ae6d 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -370,7 +370,7 @@ TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), - TestFile("ascend/test_ascend_w4a4_quantization.py", 400), + TestFile("ascend/test_ascend_w4a4_quantization.py", 1600), ], "per-commit-4-npu-a2": [ TestFile("ascend/test_ascend_mla_w8a8int8.py", 400), From c9a8122ef04f56f3be258c26f70cd1f5d0140142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 24 Dec 2025 11:13:42 +0300 Subject: [PATCH 113/175] Update test_ascend_w8a8_quantization.py --- test/srt/ascend/test_ascend_w8a8_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w8a8_quantization.py b/test/srt/ascend/test_ascend_w8a8_quantization.py index f3f9cdff952b..e0b3545701c6 100644 --- a/test/srt/ascend/test_ascend_w8a8_quantization.py +++ b/test/srt/ascend/test_ascend_w8a8_quantization.py @@ -69,7 +69,7 @@ def test_gsm8k(self): print(metrics) self.assertGreaterEqual(metrics["accuracy"], 0.3) - self.assertGreaterEqual(metrics["output_throughput"], 1000) + self.assertGreaterEqual(metrics["output_throughput"], 700) def run_decode(self, max_new_tokens): response = requests.post( From 6bb9f20b45d3f4ff77f48ffe70fc3873ca1db8fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 24 Dec 2025 11:14:33 +0300 Subject: [PATCH 114/175] Update run_suite.py --- test/srt/run_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 5687cbeaf17c..c5397ff0a23f 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -354,7 +354,7 @@ TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), - TestFile("ascend/test_ascend_w4a4_quantization.py", 1600), + TestFile("ascend/test_ascend_w4a4_quantization.py", 400), ], "per-commit-4-npu-a2": [ TestFile("ascend/test_ascend_mla_w8a8int8.py", 400), From 836dc164871bca29058e8e28ebf76de22a9ed5f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 24 Dec 2025 11:18:41 +0300 Subject: [PATCH 115/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 9525b0407a08..975424e1883e 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -34,7 +34,7 @@ class TestAscendW4A4(CustomTestCase): @classmethod def setUpClass(cls): - cls.model = "Eco-Tech/Qwen3-8B-w4a4-QuaRot" + cls.model = "/root/.cache/modelscope/hub/models/Eco-Tech/Qwen3-8B-w4a4-QuaRot" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, From 14b6ab8770aa9116b48cd5792e87011cc13672c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 24 Dec 2025 16:15:25 +0300 Subject: [PATCH 116/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 975424e1883e..e1e1c430b7b9 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -75,7 +75,7 @@ def test_gsm8k(self): metrics = run_eval(args) print(metrics) - self.assertGreaterEqual(metrics["accuracy"], 0.75) + self.assertGreaterEqual(metrics["accuracy"], 0.50) self.assertGreaterEqual(metrics["output_throughput"], 700) def run_decode(self, max_new_tokens): From 15040ccb2e6b4c244c4933520de9de60ddddc112 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 25 Dec 2025 14:55:08 +0300 Subject: [PATCH 117/175] Update msmodelslim_moe.py Debug pring for CI --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index ff8639b7678e..f9f677e5d520 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -63,6 +63,7 @@ def get_moe_method( logger.info_once("Using ModelSlimW8A8Int8MoE") return ModelSlimW8A8Int8MoE(quant_config) else: + print(f"Layer: {prefix_in_quant_config}") raise RuntimeError( f"Unsupported FusedMoe modelslim scheme: \ {quant_config.quant_description.get(prefix_in_quant_config)}" From 5a1c7ece5d7bf3b54ec7c8df860538502c47fe28 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Thu, 25 Dec 2025 19:00:39 +0300 Subject: [PATCH 118/175] Update msmodelslim_moe.py ModelSlim unquant MoE layer processing --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index f9f677e5d520..6c71523ff16a 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -63,11 +63,12 @@ def get_moe_method( logger.info_once("Using ModelSlimW8A8Int8MoE") return ModelSlimW8A8Int8MoE(quant_config) else: - print(f"Layer: {prefix_in_quant_config}") - raise RuntimeError( + logger.warning( f"Unsupported FusedMoe modelslim scheme: \ - {quant_config.quant_description.get(prefix_in_quant_config)}" + {quant_config.quant_description.get(prefix_in_quant_config.strip())} \ + in layer: {prefix}" ) + return None class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod): From a26d9e66196f38bc8c43adddc5c38ad9d552ed80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 25 Dec 2025 19:05:07 +0300 Subject: [PATCH 119/175] Update run_suite.py --- test/srt/run_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index c5397ff0a23f..952272adbbac 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -354,7 +354,6 @@ TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), - TestFile("ascend/test_ascend_w4a4_quantization.py", 400), ], "per-commit-4-npu-a2": [ TestFile("ascend/test_ascend_mla_w8a8int8.py", 400), @@ -362,6 +361,7 @@ TestFile("ascend/test_ascend_tp4_bf16.py", 400), ], "per-commit-16-npu-a3": [ + TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_deepep.py", 400), TestFile("ascend/test_ascend_deepseek_mtp.py", 400), ], From 1d4446639f8af04dcb73d285109a5029e5f00d03 Mon Sep 17 00:00:00 2001 From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com> Date: Fri, 26 Dec 2025 15:09:27 +0300 Subject: [PATCH 120/175] Add modelslim to optimized methods --- python/sglang/srt/configs/model_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 8c532b7a1ffc..327ef0466bcd 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -755,6 +755,7 @@ def _verify_quantization(self) -> None: "w4afp8", "petit_nvfp4", "quark", + "modelslim", ] compatible_quantization_methods = { "modelopt_fp8": ["modelopt"], From 1c888e0202fdbf640c97e7fd821272afc44e3a5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 13:07:56 +0300 Subject: [PATCH 121/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index e1e1c430b7b9..5164d72eb513 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -24,7 +24,7 @@ os.environ["SGLANG_USE_MODELSCOPE"] = "true" if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3" DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 ) @@ -34,7 +34,7 @@ class TestAscendW4A4(CustomTestCase): @classmethod def setUpClass(cls): - cls.model = "/root/.cache/modelscope/hub/models/Eco-Tech/Qwen3-8B-w4a4-QuaRot" + cls.model = "/root/.cache/modelscope/hub/models/Eco-Tech/Qwen3-32B-w4a4-LAOS" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, @@ -47,7 +47,7 @@ def setUpClass(cls): "--attention-backend", "ascend", "--tp-size", - "2", + "4", "--mem-fraction-static", "0.8", "--cuda-graph-bs", @@ -66,7 +66,7 @@ def test_gsm8k(self): args = SimpleNamespace( num_shots=5, data_path=None, - num_questions=128, + num_questions=1319, max_new_tokens=512, parallel=64, host=f"http://{url.hostname}", @@ -75,8 +75,8 @@ def test_gsm8k(self): metrics = run_eval(args) print(metrics) - self.assertGreaterEqual(metrics["accuracy"], 0.50) - self.assertGreaterEqual(metrics["output_throughput"], 700) + self.assertAlmostEqual(metrics["accuracy"], 0.84) + self.assertAlmostEqual(metrics["output_throughput"], 1100) def run_decode(self, max_new_tokens): response = requests.post( @@ -103,7 +103,7 @@ def test_throughput(self): print(f"Throughput: {throughput} tokens/s") if is_in_ci(): - self.assertGreaterEqual(throughput, 25) + self.assertAlmostEqual(throughput, 38) if __name__ == "__main__": From 1830d749b328b12a4b863a0194b16c13abda1e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 13:24:46 +0300 Subject: [PATCH 122/175] Resolve conflicts 1/2 --- .../npu/quantization/fused_moe_method_npu.py | 262 +----------------- 1 file changed, 6 insertions(+), 256 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index f949ba1261c7..06158d8eb580 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -261,235 +261,9 @@ def apply_without_routing_weights( class NPUW4A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase): -<<<<<<< HEAD def _process_scale( self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight ): -======= - def __init__(self, activation_use_clip: bool) -> None: - self.group_size = 0 - self.tp_size = 1 - self.activation_use_clip = activation_use_clip - - def create_weights( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported - - self.is_per_channel_weight = self.group_size == 0 - self.num_experts = num_experts - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} - ) - - # >> weight - w13_output_size = intermediate_size_per_partition - w2_output_size = hidden_size // 2 - w13_weight = torch.nn.Parameter( - torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8), - requires_grad=False, - ) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - w2_output_size, - intermediate_size_per_partition, - dtype=torch.int8, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - - # >> scale - weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32 - w13_weight_scale = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - 1, - dtype=weight_scale_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - - w2_weight_scale = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype), - requires_grad=False, - ) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) - - # >> offset - w13_weight_offset = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_offset", w13_weight_offset) - set_weight_attrs(w13_weight_offset, extra_weight_attrs) - - w2_weight_offset = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), - requires_grad=False, - ) - layer.register_parameter("w2_weight_offset", w2_weight_offset) - set_weight_attrs(w2_weight_offset, extra_weight_attrs) - - # >>> special param for w4a8 - if self.activation_use_clip: - self._init_activation_clip_params( - layer, - num_experts, - hidden_size, - intermediate_size_per_partition, - extra_weight_attrs, - ) - else: - self._init_extra_scale_params( - layer, - num_experts, - hidden_size, - intermediate_size_per_partition, - extra_weight_attrs, - ) - - def _init_activation_clip_params( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - extra_weight_attrs: dict, - ) -> None: - """ - Initializes bias and alpha parameters for quantization schemes that use activation clipping. - - This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to - shift and scale the activations or outputs to compensate for the precision loss - introduced by clamping activations. - """ - w13_bias = torch.nn.Parameter( - torch.ones( - num_experts, 2 * intermediate_size_per_partition, dtype=torch.float - ), - requires_grad=False, - ) - layer.register_parameter("w13_bias", w13_bias) - set_weight_attrs(w13_bias, extra_weight_attrs) - - w2_bias = torch.nn.Parameter( - torch.ones(num_experts, hidden_size, dtype=torch.float), - requires_grad=False, - ) - layer.register_parameter("w2_bias", w2_bias) - set_weight_attrs(w2_bias, extra_weight_attrs) - - w2_alpha = torch.nn.Parameter( - torch.ones(num_experts, dtype=torch.float), requires_grad=False - ) - layer.register_parameter("w2_alpha", w2_alpha) - set_weight_attrs(w2_alpha, extra_weight_attrs) - - def _init_extra_scale_params( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - extra_weight_attrs: dict, - ) -> None: - """ - Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping. - - This method registers the following parameters: - 1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`. - 2. Secondary Quantization Params (initialized only for grouped quantization): - `w13_weight_scale_second`, `w13_weight_offset_second`, - `w2_weight_scale_second`, and `w2_weight_offset_second`. - """ - if not self.is_per_channel_weight: - w13_weight_scale_second = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second) - set_weight_attrs(w13_weight_scale_second, extra_weight_attrs) - - w13_weight_offset_second = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter( - "w13_weight_offset_second", w13_weight_offset_second - ) - set_weight_attrs(w13_weight_offset_second, extra_weight_attrs) - - w2_weight_scale_second = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second) - set_weight_attrs(w2_weight_scale_second, extra_weight_attrs) - - w2_weight_offset_second = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition // self.group_size, - dtype=torch.float32, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second) - set_weight_attrs(w2_weight_offset_second, extra_weight_attrs) - - w13_scale_bias = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w13_scale_bias", w13_scale_bias) - set_weight_attrs(w13_scale_bias, extra_weight_attrs) - - w2_scale_bias = torch.nn.Parameter( - torch.empty( - num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32 - ), - requires_grad=False, - ) - layer.register_parameter("w2_scale_bias", w2_scale_bias) - set_weight_attrs(w2_scale_bias, extra_weight_attrs) - - def process_scale(self, weight: torch.Tensor, scale, per_group_scale): ->>>>>>> sglang-main/main scale = scale.transpose(1, 2).contiguous() if is_per_channel_weight: @@ -537,18 +311,12 @@ def _pack_to_int32(self, weight: torch.Tensor): ), "the last dim of weight needs to be divided by 4" return weight.view(torch.int32).contiguous() -<<<<<<< HEAD - def process_weights_after_loading( - self, layer: torch.nn.Module, is_per_channel_weight - ) -> None: -======= - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if not self.activation_use_clip: self._process_weights_without_clip(layer) else: self._process_weights_with_clip(layer) ->>>>>>> sglang-main/main layer.w13_weight = torch.nn.Parameter( layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False ) @@ -573,17 +341,11 @@ def _process_weights_without_clip(self, layer: torch.nn.Module) -> None: if hasattr(layer, "w2_weight_scale_second") else None ) - layer.w13_weight_scale.data, w13_bias = self._process_scale( - layer.w13_weight, - layer.w13_weight_scale.data, - w13_weight_scale_second, - is_per_channel_weight, + layer.w13_weight_scale.data, w13_bias = self.process_scale( + layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second ) - layer.w2_weight_scale.data, w2_bias = self._process_scale( - layer.w2_weight, - layer.w2_weight_scale.data, - w2_weight_scale_second, - is_per_channel_weight, + layer.w2_weight_scale.data, w2_bias = self.process_scale( + layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second ) if hasattr(layer, "w13_weight_scale_second"): # scale_second is no longer used, release this part of the memory @@ -592,14 +354,8 @@ def _process_weights_without_clip(self, layer: torch.nn.Module) -> None: del layer.w13_weight_offset_second del layer.w2_weight_offset_second - self._update_bias(layer, w13_bias, w2_bias) + self.update_bias(layer, w13_bias, w2_bias) -<<<<<<< HEAD - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - layer.w13_weight.data = self._pack_to_int32(layer.w13_weight.data) - layer.w2_weight.data = self._pack_to_int32(layer.w2_weight.data) -======= def _process_weights_with_clip(self, layer: torch.nn.Module) -> None: w13_weight_scale = ( layer.w13_weight_scale.data.squeeze(-1).contiguous().unsqueeze(1) @@ -614,12 +370,6 @@ def _process_weights_with_clip(self, layer: torch.nn.Module) -> None: layer.w13_scale_bias = layer.w13_bias layer.w2_scale_bias = layer.w2_bias - def create_moe_runner( - self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" - ): - self.moe_runner_config = moe_runner_config ->>>>>>> sglang-main/main - def apply( self, layer, From 46a3570dc470a9c37b367f62d168d26c8e35f3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 13:33:56 +0300 Subject: [PATCH 123/175] Resolve conflicts 2/2 --- .../compressed_tensors_moe.py | 263 ++++++++++++++++++ 1 file changed, 263 insertions(+) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 1947e331c740..c9465828ce0a 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -16,6 +16,7 @@ use_symmetric_memory, ) from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import ( + NPUW4A8Int8DynamicMoEMethod, NPUW4A16Int4DynamicMoEMethod, NPUW8A8Int8DynamicMoEMethod, ) @@ -85,6 +86,7 @@ class GPTQMarlinState(Enum): __all__ = [ "CompressedTensorsMoEMethod", "CompressedTensorsW4A4Nvfp4MoEMethod", + "NPUCompressedTensorsW4A8Int8DynamicMoEMethod", "CompressedTensorsW8A8Fp8MoEMethod", "NPUCompressedTensorsW8A8Int8MoEMethod", "CompressedTensorsWNA16MoEMethod", @@ -1298,6 +1300,267 @@ def apply( return StandardCombineInput(hidden_states=output) +class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): + + def __init__(self, quantization_config) -> None: + self.activation_use_clip = activation_use_clip + self.kernel = NPUW4A8Int8DynamicMoEMethod() + + # TODO: See if we can merge this method's logic + # with CompressedTensorsWNA8MoEMethod. Need more models and tests. + # @OrangeRedeng @TamirBaydasov + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported + + self.is_per_channel_weight = self.group_size == 0 + self.num_experts = num_experts + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + + # >> weight + w13_output_size = intermediate_size_per_partition + w2_output_size = hidden_size // 2 + w13_weight = torch.nn.Parameter( + torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + w2_output_size, + intermediate_size_per_partition, + dtype=torch.int8, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # >> scale + weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32 + w13_weight_scale = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=weight_scale_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + + w2_weight_scale = torch.nn.Parameter( + torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # >> offset + w13_weight_offset = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_offset", w13_weight_offset) + set_weight_attrs(w13_weight_offset, extra_weight_attrs) + + w2_weight_offset = torch.nn.Parameter( + torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_offset", w2_weight_offset) + set_weight_attrs(w2_weight_offset, extra_weight_attrs) + + # >>> special param for w4a8 + if self.activation_use_clip: + self._init_activation_clip_params( + layer, + num_experts, + hidden_size, + intermediate_size_per_partition, + extra_weight_attrs, + ) + else: + self._init_extra_scale_params( + layer, + num_experts, + hidden_size, + intermediate_size_per_partition, + extra_weight_attrs, + ) + + def _init_activation_clip_params( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + extra_weight_attrs: dict, + ) -> None: + """ + Initializes bias and alpha parameters for quantization schemes that use activation clipping. + + This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to + shift and scale the activations or outputs to compensate for the precision loss + introduced by clamping activations. + """ + w13_bias = torch.nn.Parameter( + torch.ones( + num_experts, 2 * intermediate_size_per_partition, dtype=torch.float + ), + requires_grad=False, + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + w2_bias = torch.nn.Parameter( + torch.ones(num_experts, hidden_size, dtype=torch.float), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + w2_alpha = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float), requires_grad=False + ) + layer.register_parameter("w2_alpha", w2_alpha) + set_weight_attrs(w2_alpha, extra_weight_attrs) + + def _init_extra_scale_params( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + extra_weight_attrs: dict, + ) -> None: + """ + Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping. + + This method registers the following parameters: + 1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`. + 2. Secondary Quantization Params (initialized only for grouped quantization): + `w13_weight_scale_second`, `w13_weight_offset_second`, + `w2_weight_scale_second`, and `w2_weight_offset_second`. + """ + if not self.is_per_channel_weight: + w13_weight_scale_second = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second) + set_weight_attrs(w13_weight_scale_second, extra_weight_attrs) + + w13_weight_offset_second = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter( + "w13_weight_offset_second", w13_weight_offset_second + ) + set_weight_attrs(w13_weight_offset_second, extra_weight_attrs) + + w2_weight_scale_second = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second) + set_weight_attrs(w2_weight_scale_second, extra_weight_attrs) + + w2_weight_offset_second = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second) + set_weight_attrs(w2_weight_offset_second, extra_weight_attrs) + + w13_scale_bias = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_scale_bias", w13_scale_bias) + set_weight_attrs(w13_scale_bias, extra_weight_attrs) + + w2_scale_bias = torch.nn.Parameter( + torch.empty( + num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w2_scale_bias", w2_scale_bias) + set_weight_attrs(w2_scale_bias, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def create_moe_runner( + self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig + ): + self.moe_runner_config = moe_runner_config + + def apply( + self, + layer: torch.nn.Module, + dispatch_output: StandardDispatchOutput, + ) -> CombineInput: + + return self.kernel.apply(layer, dispatch_output) + + def apply_without_routing_weights( + self, + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ): + return self.kernel.apply_without_routing_weights( + layer, + hidden_states, + hidden_states_scale, + group_list_type, + group_list, + output_dtype, + ) + + class NPUCompressedTensorsW4A16Int4DynamicMoEMethod(CompressedTensorsMoEMethod): def __init__(self, quantization_config) -> None: From ffdc7dc3c8554b354cd18e079d67101a451513c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 13:41:26 +0300 Subject: [PATCH 124/175] Update compressed_tensors_moe.py --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c9465828ce0a..1b4c72937504 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1303,6 +1303,8 @@ def apply( class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): def __init__(self, quantization_config) -> None: + self.group_size = 0 + self.tp_size = 1 self.activation_use_clip = activation_use_clip self.kernel = NPUW4A8Int8DynamicMoEMethod() From c38e16fc7b42c7455f916b3549ab67f6e6059bc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 13:44:31 +0300 Subject: [PATCH 125/175] Update compressed_tensors_moe.py --- .../compressed_tensors/compressed_tensors_moe.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 1b4c72937504..328f916e16e7 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -125,6 +125,14 @@ def get_moe_method( "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod" ) return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config) + elif ( + quant_config._is_dynamic_token_w4(weight_quant, input_quant) + and input_quant is not None + ): + logger.info_once( + "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod" + ) + return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod") return CompressedTensorsW4A4Nvfp4MoEMethod(quant_config) From ef216f4b177409479ef8d325010fca4dfb920f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 13:47:18 +0300 Subject: [PATCH 126/175] Update compressed_tensors_moe.py --- .../compressed_tensors/compressed_tensors_moe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 328f916e16e7..519a52e6ef96 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1313,7 +1313,11 @@ class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): def __init__(self, quantization_config) -> None: self.group_size = 0 self.tp_size = 1 - self.activation_use_clip = activation_use_clip + self.activation_use_clip = = ( + self.quantization_config.get("config_groups", {}) + .get("group_1", {}) + .get("activation_use_clip", False) + ) self.kernel = NPUW4A8Int8DynamicMoEMethod() # TODO: See if we can merge this method's logic From 5d43c4ac08bdb761794dd0d5098b238cdfd8827a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 14:17:03 +0300 Subject: [PATCH 127/175] Update compressed_tensors_moe.py --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 519a52e6ef96..0fb5bc0d0d64 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1320,9 +1320,6 @@ def __init__(self, quantization_config) -> None: ) self.kernel = NPUW4A8Int8DynamicMoEMethod() - # TODO: See if we can merge this method's logic - # with CompressedTensorsWNA8MoEMethod. Need more models and tests. - # @OrangeRedeng @TamirBaydasov def create_weights( self, layer: torch.nn.Module, From bee77f0934aae6d7f632f2f17d2bec5ed6835d42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 14:29:26 +0300 Subject: [PATCH 128/175] Update compressed_tensors_moe.py --- .../compressed_tensors_moe.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 0fb5bc0d0d64..69507fc76327 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -125,14 +125,6 @@ def get_moe_method( "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod" ) return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config) - elif ( - quant_config._is_dynamic_token_w4(weight_quant, input_quant) - and input_quant is not None - ): - logger.info_once( - "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod" - ) - return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod") return CompressedTensorsW4A4Nvfp4MoEMethod(quant_config) @@ -147,6 +139,17 @@ def get_moe_method( raise NotImplementedError( f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now." ) + elif: + # TODO add w4a8 verification method + if _is_npu: + logger.info_once( + "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod" + ) + return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config) + else: + raise NotImplementedError( + f"The W4A8Int8 Fused MoE scheme is implemented only for NPU for now." + ) else: raise RuntimeError( f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}" From ee59b957ec5961c81429eb225977d5bd51ffdfe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 14:34:50 +0300 Subject: [PATCH 129/175] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 06158d8eb580..eb41560daebe 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -311,7 +311,7 @@ def _pack_to_int32(self, weight: torch.Tensor): ), "the last dim of weight needs to be divided by 4" return weight.view(torch.int32).contiguous() - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if not self.activation_use_clip: self._process_weights_without_clip(layer) else: From 02d7a6a1688ef266c44415470fc8489538c8ea7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 14:37:18 +0300 Subject: [PATCH 130/175] Update msmodelslim_moe.py --- .../msmodelslim/msmodelslim_moe.py | 88 ++++++++++++++++++- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 6c71523ff16a..bf59a22c25a1 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -81,9 +81,14 @@ def __init__( self.quant_config = quant_config self.group_size = 0 self.tp_size = 1 + self.activation_use_clip = ( + self.quant_description.get("config_groups", {}) + .get("group_1", {}) + .get("activation_use_clip", False) + ) self.kernel = NPUW4A8Int8DynamicMoEMethod() - def create_weights( +def create_weights( self, layer: torch.nn.Module, num_experts: int, @@ -122,9 +127,13 @@ def create_weights( set_weight_attrs(w2_weight, extra_weight_attrs) # >> scale + weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32 w13_weight_scale = torch.nn.Parameter( torch.empty( - num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=weight_scale_dtype, ), requires_grad=False, ) @@ -132,7 +141,7 @@ def create_weights( set_weight_attrs(w13_weight_scale, extra_weight_attrs) w2_weight_scale = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), + torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype), requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) @@ -156,6 +165,77 @@ def create_weights( set_weight_attrs(w2_weight_offset, extra_weight_attrs) # >>> special param for w4a8 + if self.activation_use_clip: + self._init_activation_clip_params( + layer, + num_experts, + hidden_size, + intermediate_size_per_partition, + extra_weight_attrs, + ) + else: + self._init_extra_scale_params( + layer, + num_experts, + hidden_size, + intermediate_size_per_partition, + extra_weight_attrs, + ) + + def _init_activation_clip_params( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + extra_weight_attrs: dict, + ) -> None: + """ + Initializes bias and alpha parameters for quantization schemes that use activation clipping. + + This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to + shift and scale the activations or outputs to compensate for the precision loss + introduced by clamping activations. + """ + w13_bias = torch.nn.Parameter( + torch.ones( + num_experts, 2 * intermediate_size_per_partition, dtype=torch.float + ), + requires_grad=False, + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + w2_bias = torch.nn.Parameter( + torch.ones(num_experts, hidden_size, dtype=torch.float), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + w2_alpha = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float), requires_grad=False + ) + layer.register_parameter("w2_alpha", w2_alpha) + set_weight_attrs(w2_alpha, extra_weight_attrs) + + def _init_extra_scale_params( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + extra_weight_attrs: dict, + ) -> None: + """ + Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping. + + This method registers the following parameters: + 1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`. + 2. Secondary Quantization Params (initialized only for grouped quantization): + `w13_weight_scale_second`, `w13_weight_offset_second`, + `w2_weight_scale_second`, and `w2_weight_offset_second`. + """ if not self.is_per_channel_weight: w13_weight_scale_second = torch.nn.Parameter( torch.empty( @@ -168,6 +248,7 @@ def create_weights( ) layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second) set_weight_attrs(w13_weight_scale_second, extra_weight_attrs) + w13_weight_offset_second = torch.nn.Parameter( torch.empty( num_experts, @@ -224,6 +305,7 @@ def create_weights( layer.register_parameter("w2_scale_bias", w2_scale_bias) set_weight_attrs(w2_scale_bias, extra_weight_attrs) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight) From ff41d738a6ef0bb35fc2515f1f3d5ebc82759704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 14:47:56 +0300 Subject: [PATCH 131/175] Update compressed_tensors_moe.py --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 69507fc76327..5b68018ebe17 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1313,6 +1313,7 @@ def apply( class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): + ### TODO: Get rid of code duplication with python/sglang/srt/msmodelslim/msmodelslim_moe.py @OrangeRedeng @TamirBaydasov def __init__(self, quantization_config) -> None: self.group_size = 0 self.tp_size = 1 @@ -1322,7 +1323,7 @@ def __init__(self, quantization_config) -> None: .get("activation_use_clip", False) ) self.kernel = NPUW4A8Int8DynamicMoEMethod() - + def create_weights( self, layer: torch.nn.Module, From 8d1bb48b67200273e4a5d503e1d0cb66aed4bff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 14:53:00 +0300 Subject: [PATCH 132/175] Fix lint issue --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 5b68018ebe17..df056208cd9b 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1323,7 +1323,7 @@ def __init__(self, quantization_config) -> None: .get("activation_use_clip", False) ) self.kernel = NPUW4A8Int8DynamicMoEMethod() - + def create_weights( self, layer: torch.nn.Module, From 6b46093e7e3365ac684bcbb5c80b455e1c1b8825 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:21:27 +0300 Subject: [PATCH 133/175] Fix lint issue --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index bf59a22c25a1..8a09950f0e6b 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -88,7 +88,7 @@ def __init__( ) self.kernel = NPUW4A8Int8DynamicMoEMethod() -def create_weights( + def create_weights( self, layer: torch.nn.Module, num_experts: int, From 567a771bdcd29c32567517c702d047aa9ae3caed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:28:24 +0300 Subject: [PATCH 134/175] Update compressed_tensors_moe.py --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index df056208cd9b..0ca79ef4db1f 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -139,7 +139,7 @@ def get_moe_method( raise NotImplementedError( f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now." ) - elif: + elif quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is not None: # TODO add w4a8 verification method if _is_npu: logger.info_once( From 1b2f289382231c985ac18437d9921fab3664eaa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:33:21 +0300 Subject: [PATCH 135/175] Update msmodelslim_moe.py --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 8a09950f0e6b..2171295eb317 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -305,7 +305,6 @@ def _init_extra_scale_params( layer.register_parameter("w2_scale_bias", w2_scale_bias) set_weight_attrs(w2_scale_bias, extra_weight_attrs) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight) From fe7067c0241986f1df1498b36fba611d62c0437f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:33:58 +0300 Subject: [PATCH 136/175] Update compressed_tensors_moe.py --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 0ca79ef4db1f..6b2421347993 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1317,7 +1317,7 @@ class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): def __init__(self, quantization_config) -> None: self.group_size = 0 self.tp_size = 1 - self.activation_use_clip = = ( + self.activation_use_clip = ( self.quantization_config.get("config_groups", {}) .get("group_1", {}) .get("activation_use_clip", False) From 2e390e3da605aa3aec9e9644e7dcdbb4edcea8e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:42:04 +0300 Subject: [PATCH 137/175] Fix lint issue --- .../compressed_tensors/compressed_tensors_moe.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 6b2421347993..ee78d6706023 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -139,12 +139,13 @@ def get_moe_method( raise NotImplementedError( f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now." ) - elif quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is not None: + elif ( + quant_config._is_dynamic_token_w4(weight_quant, input_quant) + and input_quant is not None\ + ): # TODO add w4a8 verification method if _is_npu: - logger.info_once( - "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod" - ) + logger.info_once("Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod") return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config) else: raise NotImplementedError( From ee17e0cc8cb1d6c4199688a642521e760a0f7060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:52:53 +0300 Subject: [PATCH 138/175] Update msmodelslim_moe.py --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 2171295eb317..ed379debbb67 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -82,7 +82,7 @@ def __init__( self.group_size = 0 self.tp_size = 1 self.activation_use_clip = ( - self.quant_description.get("config_groups", {}) + self.quant_config.get("config_groups", {}) .get("group_1", {}) .get("activation_use_clip", False) ) From 662fadaaf70f605487bf58bffb78fbc1ef810704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:57:13 +0300 Subject: [PATCH 139/175] Update msmodelslim_moe.py --- .../msmodelslim/msmodelslim_moe.py | 63 +------------------ 1 file changed, 2 insertions(+), 61 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index ed379debbb67..2f1cfd57fb03 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -81,11 +81,7 @@ def __init__( self.quant_config = quant_config self.group_size = 0 self.tp_size = 1 - self.activation_use_clip = ( - self.quant_config.get("config_groups", {}) - .get("group_1", {}) - .get("activation_use_clip", False) - ) + self.activation_use_clip = False self.kernel = NPUW4A8Int8DynamicMoEMethod() def create_weights( @@ -181,61 +177,6 @@ def create_weights( intermediate_size_per_partition, extra_weight_attrs, ) - - def _init_activation_clip_params( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - extra_weight_attrs: dict, - ) -> None: - """ - Initializes bias and alpha parameters for quantization schemes that use activation clipping. - - This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to - shift and scale the activations or outputs to compensate for the precision loss - introduced by clamping activations. - """ - w13_bias = torch.nn.Parameter( - torch.ones( - num_experts, 2 * intermediate_size_per_partition, dtype=torch.float - ), - requires_grad=False, - ) - layer.register_parameter("w13_bias", w13_bias) - set_weight_attrs(w13_bias, extra_weight_attrs) - - w2_bias = torch.nn.Parameter( - torch.ones(num_experts, hidden_size, dtype=torch.float), - requires_grad=False, - ) - layer.register_parameter("w2_bias", w2_bias) - set_weight_attrs(w2_bias, extra_weight_attrs) - - w2_alpha = torch.nn.Parameter( - torch.ones(num_experts, dtype=torch.float), requires_grad=False - ) - layer.register_parameter("w2_alpha", w2_alpha) - set_weight_attrs(w2_alpha, extra_weight_attrs) - - def _init_extra_scale_params( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - extra_weight_attrs: dict, - ) -> None: - """ - Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping. - - This method registers the following parameters: - 1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`. - 2. Secondary Quantization Params (initialized only for grouped quantization): - `w13_weight_scale_second`, `w13_weight_offset_second`, - `w2_weight_scale_second`, and `w2_weight_offset_second`. - """ if not self.is_per_channel_weight: w13_weight_scale_second = torch.nn.Parameter( torch.empty( @@ -306,7 +247,7 @@ def _init_extra_scale_params( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight) + self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" From 2fb272d65bd9817a6ce036e2d350344317f46a10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:59:28 +0300 Subject: [PATCH 140/175] Update compressed_tensors_moe.py --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index ee78d6706023..fa02ded08c0a 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1317,6 +1317,7 @@ class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): ### TODO: Get rid of code duplication with python/sglang/srt/msmodelslim/msmodelslim_moe.py @OrangeRedeng @TamirBaydasov def __init__(self, quantization_config) -> None: self.group_size = 0 + self.is_per_channel_weight = self.group_size == 0 self.tp_size = 1 self.activation_use_clip = ( self.quantization_config.get("config_groups", {}) @@ -1336,7 +1337,6 @@ def create_weights( ) -> None: from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported - self.is_per_channel_weight = self.group_size == 0 self.num_experts = num_experts extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} @@ -1543,7 +1543,7 @@ def _init_extra_scale_params( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - self.kernel.process_weights_after_loading(layer) + self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig From ae7875c9bf93774631d6c418e16dedc208d1a20c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:59:59 +0300 Subject: [PATCH 141/175] Update msmodelslim_moe.py --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 2f1cfd57fb03..fa0d85f328d6 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -80,6 +80,7 @@ def __init__( ): self.quant_config = quant_config self.group_size = 0 + self.is_per_channel_weight = self.group_size == 0 self.tp_size = 1 self.activation_use_clip = False self.kernel = NPUW4A8Int8DynamicMoEMethod() @@ -95,7 +96,6 @@ def create_weights( ) -> None: from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported - self.is_per_channel_weight = self.group_size == 0 self.num_experts = num_experts extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} From b4c0ebe0c6560a7a9b7edbd5ffad81728d213306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:03:16 +0300 Subject: [PATCH 142/175] Update fused_moe_method_npu.py --- .../npu/quantization/fused_moe_method_npu.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index eb41560daebe..b4aa715aed03 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -311,9 +311,9 @@ def _pack_to_int32(self, weight: torch.Tensor): ), "the last dim of weight needs to be divided by 4" return weight.view(torch.int32).contiguous() - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - if not self.activation_use_clip: - self._process_weights_without_clip(layer) + def process_weights_after_loading(self, layer: torch.nn.Module, is_per_channel_weight, activation_use_clip) -> None: + if not activation_use_clip: + self._process_weights_without_clip(layer, is_per_channel_weight) else: self._process_weights_with_clip(layer) @@ -330,7 +330,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) - def _process_weights_without_clip(self, layer: torch.nn.Module) -> None: + def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_weight) -> None: w13_weight_scale_second = ( layer.w13_weight_scale_second.data if hasattr(layer, "w13_weight_scale_second") @@ -342,10 +342,10 @@ def _process_weights_without_clip(self, layer: torch.nn.Module) -> None: else None ) layer.w13_weight_scale.data, w13_bias = self.process_scale( - layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second + layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight, ) layer.w2_weight_scale.data, w2_bias = self.process_scale( - layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second + layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight, ) if hasattr(layer, "w13_weight_scale_second"): # scale_second is no longer used, release this part of the memory From 897094ccb8c6e519513dd074ae0d4609552c8b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:07:21 +0300 Subject: [PATCH 143/175] Update msmodelslim_moe.py --- .../msmodelslim/msmodelslim_moe.py | 26 +++---------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index fa0d85f328d6..856bbd32574f 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -96,6 +96,7 @@ def create_weights( ) -> None: from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported + self.is_per_channel_weight = self.group_size == 0 self.num_experts = num_experts extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} @@ -123,13 +124,9 @@ def create_weights( set_weight_attrs(w2_weight, extra_weight_attrs) # >> scale - weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32 w13_weight_scale = torch.nn.Parameter( torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - 1, - dtype=weight_scale_dtype, + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 ), requires_grad=False, ) @@ -137,7 +134,7 @@ def create_weights( set_weight_attrs(w13_weight_scale, extra_weight_attrs) w2_weight_scale = torch.nn.Parameter( - torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype), + torch.empty(num_experts, hidden_size, 1, dtype=torch.float32), requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) @@ -161,22 +158,6 @@ def create_weights( set_weight_attrs(w2_weight_offset, extra_weight_attrs) # >>> special param for w4a8 - if self.activation_use_clip: - self._init_activation_clip_params( - layer, - num_experts, - hidden_size, - intermediate_size_per_partition, - extra_weight_attrs, - ) - else: - self._init_extra_scale_params( - layer, - num_experts, - hidden_size, - intermediate_size_per_partition, - extra_weight_attrs, - ) if not self.is_per_channel_weight: w13_weight_scale_second = torch.nn.Parameter( torch.empty( @@ -189,7 +170,6 @@ def create_weights( ) layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second) set_weight_attrs(w13_weight_scale_second, extra_weight_attrs) - w13_weight_offset_second = torch.nn.Parameter( torch.empty( num_experts, From b4636259e2dd7ad31e22b306be4cbcd1ecc7c28b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:12:36 +0300 Subject: [PATCH 144/175] Fix lint issue --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 856bbd32574f..6b74c7fe2eba 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -227,7 +227,11 @@ def create_weights( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip) + self.kernel.process_weights_after_loading( + layer, + self.is_per_channel_weight, + self.activation_use_clip + ) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" From 349dcd09986437f03d83dc701f839d4a763a7c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:14:21 +0300 Subject: [PATCH 145/175] Fix lint issue --- .../compressed_tensors/compressed_tensors_moe.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fa02ded08c0a..9a302dbe92a0 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -140,8 +140,8 @@ def get_moe_method( f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now." ) elif ( - quant_config._is_dynamic_token_w4(weight_quant, input_quant) - and input_quant is not None\ + quant_config._is_dynamic_token_w4(weight_quant, input_quant) + and input_quant is not None ): # TODO add w4a8 verification method if _is_npu: @@ -1543,7 +1543,11 @@ def _init_extra_scale_params( set_weight_attrs(w2_scale_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip) + self.kernel.process_weights_after_loading( + layer, + self.is_per_channel_weight, + self.activation_use_clip + ) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig From b77689546a48d3086d74c19fe5020c2ab023c17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:15:40 +0300 Subject: [PATCH 146/175] Update fused_moe_method_npu.py --- .../npu/quantization/fused_moe_method_npu.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index b4aa715aed03..00acf80b793c 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -311,7 +311,9 @@ def _pack_to_int32(self, weight: torch.Tensor): ), "the last dim of weight needs to be divided by 4" return weight.view(torch.int32).contiguous() - def process_weights_after_loading(self, layer: torch.nn.Module, is_per_channel_weight, activation_use_clip) -> None: + def process_weights_after_loading( + self, layer: torch.nn.Module, is_per_channel_weight, activation_use_clip + ) -> None: if not activation_use_clip: self._process_weights_without_clip(layer, is_per_channel_weight) else: @@ -342,10 +344,16 @@ def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_w else None ) layer.w13_weight_scale.data, w13_bias = self.process_scale( - layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight, + layer.w13_weight, + layer.w13_weight_scale.data, + w13_weight_scale_second, + is_per_channel_weight, ) layer.w2_weight_scale.data, w2_bias = self.process_scale( - layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight, + layer.w2_weight, + layer.w2_weight_scale.data, + w2_weight_scale_second, + is_per_channel_weight, ) if hasattr(layer, "w13_weight_scale_second"): # scale_second is no longer used, release this part of the memory From 56c8d06fdf0b04e58f9883faffb96f1bc33c6944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:20:40 +0300 Subject: [PATCH 147/175] Fix lint issue --- .../npu/quantization/fused_moe_method_npu.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 00acf80b793c..5ffd9d8e3a6b 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -332,7 +332,11 @@ def process_weights_after_loading( layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) - def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_weight) -> None: + def _process_weights_without_clip( + self, + layer: torch.nn.Module, + is_per_channel_weight + ) -> None: w13_weight_scale_second = ( layer.w13_weight_scale_second.data if hasattr(layer, "w13_weight_scale_second") @@ -344,15 +348,15 @@ def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_w else None ) layer.w13_weight_scale.data, w13_bias = self.process_scale( - layer.w13_weight, - layer.w13_weight_scale.data, - w13_weight_scale_second, + layer.w13_weight, + layer.w13_weight_scale.data, + w13_weight_scale_second, is_per_channel_weight, ) layer.w2_weight_scale.data, w2_bias = self.process_scale( - layer.w2_weight, - layer.w2_weight_scale.data, - w2_weight_scale_second, + layer.w2_weight, + layer.w2_weight_scale.data, + w2_weight_scale_second, is_per_channel_weight, ) if hasattr(layer, "w13_weight_scale_second"): From 4e9c0d07b983a9385812d35f2ed73fce6fe20360 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:21:54 +0300 Subject: [PATCH 148/175] Fix lint issue --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 9a302dbe92a0..e10f9d5e50a1 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1544,9 +1544,7 @@ def _init_extra_scale_params( def process_weights_after_loading(self, layer: torch.nn.Module) -> None: self.kernel.process_weights_after_loading( - layer, - self.is_per_channel_weight, - self.activation_use_clip + layer, self.is_per_channel_weight, self.activation_use_clip ) def create_moe_runner( From f091ab09b8c8bdbf2aa41449697c5b720ad3323d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:22:42 +0300 Subject: [PATCH 149/175] Fix lint issue --- .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py index 6b74c7fe2eba..c9dc5621cadf 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py @@ -228,9 +228,7 @@ def create_weights( def process_weights_after_loading(self, layer: torch.nn.Module) -> None: self.kernel.process_weights_after_loading( - layer, - self.is_per_channel_weight, - self.activation_use_clip + layer, self.is_per_channel_weight, self.activation_use_clip ) def create_moe_runner( From 30ea24ef8c39dc85f9c96d33b2d229fe9becaf18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:30:00 +0300 Subject: [PATCH 150/175] Fix lint issue --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 5ffd9d8e3a6b..3e26ff354fde 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -333,9 +333,7 @@ def process_weights_after_loading( layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) def _process_weights_without_clip( - self, - layer: torch.nn.Module, - is_per_channel_weight + self, layer: torch.nn.Module, is_per_channel_weight ) -> None: w13_weight_scale_second = ( layer.w13_weight_scale_second.data From b430667326d29f3f0277a892b80d52a9fd98533d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 17:30:47 +0300 Subject: [PATCH 151/175] Update fused_moe_method_npu.py --- .../npu/quantization/fused_moe_method_npu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 3e26ff354fde..91a5da075807 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -329,8 +329,8 @@ def process_weights_after_loading( layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) - layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) - layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) + layer.w13_weight.data = self._pack_to_int32(layer.w13_weight.data) + layer.w2_weight.data = self._pack_to_int32(layer.w2_weight.data) def _process_weights_without_clip( self, layer: torch.nn.Module, is_per_channel_weight @@ -345,13 +345,13 @@ def _process_weights_without_clip( if hasattr(layer, "w2_weight_scale_second") else None ) - layer.w13_weight_scale.data, w13_bias = self.process_scale( + layer.w13_weight_scale.data, w13_bias = self._process_scale( layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight, ) - layer.w2_weight_scale.data, w2_bias = self.process_scale( + layer.w2_weight_scale.data, w2_bias = self._process_scale( layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, @@ -364,7 +364,7 @@ def _process_weights_without_clip( del layer.w13_weight_offset_second del layer.w2_weight_offset_second - self.update_bias(layer, w13_bias, w2_bias) + self._update_bias(layer, w13_bias, w2_bias) def _process_weights_with_clip(self, layer: torch.nn.Module) -> None: w13_weight_scale = ( From 47e8406982dfb879c46647271e128a3cf7678ad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 18:09:45 +0300 Subject: [PATCH 152/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 5164d72eb513..ba4049c6d8a9 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -75,8 +75,8 @@ def test_gsm8k(self): metrics = run_eval(args) print(metrics) - self.assertAlmostEqual(metrics["accuracy"], 0.84) - self.assertAlmostEqual(metrics["output_throughput"], 1100) + self.assertGreaterEqual(metrics["accuracy"], 0.80) + self.assertGreaterEqual(metrics["output_throughput"], 1000) def run_decode(self, max_new_tokens): response = requests.post( From 97b38e4eb33c8a645ddc6ab3b2560db02a3587b8 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Mon, 29 Dec 2025 22:02:50 +0300 Subject: [PATCH 153/175] Rename MsModelSlim -> ModelSlim --- docs/platforms/ascend_npu_quantization.md | 15 ++++++++------- python/sglang/srt/layers/quantization/__init__.py | 2 +- .../compressed_tensors/compressed_tensors_moe.py | 2 +- .../srt/layers/quantization/modelslim/README.md | 14 ++++++++++++++ .../msmodelslim.py => modelslim/modelslim.py} | 6 ++---- .../modelslim_moe.py} | 2 +- .../quantization/modelslim/schemes/__init__.py | 11 +++++++++++ .../schemes/modelslim_scheme.py} | 0 .../schemes/modelslim_w4a4_int4.py} | 2 +- .../schemes/modelslim_w8a8_int8.py} | 2 +- .../srt/layers/quantization/msmodelslim/README.md | 14 -------------- .../quantization/msmodelslim/schemes/__init__.py | 11 ----------- 12 files changed, 40 insertions(+), 41 deletions(-) create mode 100644 python/sglang/srt/layers/quantization/modelslim/README.md rename python/sglang/srt/layers/quantization/{msmodelslim/msmodelslim.py => modelslim/modelslim.py} (98%) rename python/sglang/srt/layers/quantization/{msmodelslim/msmodelslim_moe.py => modelslim/modelslim_moe.py} (99%) create mode 100644 python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py rename python/sglang/srt/layers/quantization/{msmodelslim/schemes/msmodelslim_scheme.py => modelslim/schemes/modelslim_scheme.py} (100%) rename python/sglang/srt/layers/quantization/{msmodelslim/schemes/msmodelslim_w4a4_int4.py => modelslim/schemes/modelslim_w4a4_int4.py} (97%) rename python/sglang/srt/layers/quantization/{msmodelslim/schemes/msmodelslim_w8a8_int8.py => modelslim/schemes/modelslim_w8a8_int8.py} (98%) delete mode 100644 python/sglang/srt/layers/quantization/msmodelslim/README.md delete mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md index 172b5e295fb6..4c40fde6e170 100644 --- a/docs/platforms/ascend_npu_quantization.md +++ b/docs/platforms/ascend_npu_quantization.md @@ -2,19 +2,20 @@ Quantization on Ascend. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` or `config.json` config. -MsModelSlim on Ascend support: +[ModelSlim on Ascend support](https://github.com/sgl-project/sglang/pull/14504): - [x] W4A4 dynamic linear - [x] W8A8 static linear - [x] W8A8 dynamic linear - [x] W4A8 dynamic MOE - [x] W8A8 dynamic MOE -AWQ on Ascend support: +[AWQ on Ascend support](https://github.com/sgl-project/sglang/pull/10158): - [x] W4A16 linear -- [x] W8A16 linear # Test required -- [x] W4A16 MOE # Test required +- [x] W8A16 linear # Need to test +- [x] W4A16 MOE # Need to test Compressed-tensors (LLM Compressor) on Ascend support: -- [x] W8A8 dynamic linear -- [x] W8A8 dynamic MOE -- [x] W4A16 MOE +- [x] [W4A8 dynamic MOE with/without activation clip](https://github.com/sgl-project/sglang/pull/14736) # Need to test +- [x] [W4A16 MOE](https://github.com/sgl-project/sglang/pull/12759) +- [x] [W8A8 dynamic linear](https://github.com/sgl-project/sglang/pull/14504) +- [x] [W8A8 dynamic MOE](https://github.com/sgl-project/sglang/pull/14504) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 4aa1843a4d85..161301d6124e 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -79,7 +79,7 @@ def override_quantization_method(self, *args, **kwargs): ) if is_npu(): - from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig + from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig BASE_QUANTIZATION_METHODS.update( { diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index e10f9d5e50a1..15ddb1d25a8b 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1314,7 +1314,7 @@ def apply( class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod): - ### TODO: Get rid of code duplication with python/sglang/srt/msmodelslim/msmodelslim_moe.py @OrangeRedeng @TamirBaydasov + ### TODO: Get rid of code duplication with python/sglang/srt/modelslim/modelslim_moe.py @OrangeRedeng @TamirBaydasov def __init__(self, quantization_config) -> None: self.group_size = 0 self.is_per_channel_weight = self.group_size == 0 diff --git a/python/sglang/srt/layers/quantization/modelslim/README.md b/python/sglang/srt/layers/quantization/modelslim/README.md new file mode 100644 index 000000000000..3d34b67ae712 --- /dev/null +++ b/python/sglang/srt/layers/quantization/modelslim/README.md @@ -0,0 +1,14 @@ +Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/modelslim) module. + +`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with ModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config. + +ModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as: +- [x] W4A4 dynamic linear +- [x] W8A8 static linear +- [x] W8A8 dynamic linear +- [x] W4A8 dynamic MOE +- [x] W8A8 dynamic MOE + +Also ModelSlim module include: +- [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag) +- [x] Unit-tests for w4a4 modelslim, w8a8 modelslim diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/modelslim/modelslim.py similarity index 98% rename from python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py rename to python/sglang/srt/layers/quantization/modelslim/modelslim.py index 61913209da4a..20b6c88a1d9d 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py +++ b/python/sglang/srt/layers/quantization/modelslim/modelslim.py @@ -14,10 +14,8 @@ QuantizeMethodBase, ) from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer -from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import ( - ModelSlimMoEMethod, -) -from sglang.srt.layers.quantization.msmodelslim.schemes import ( +from sglang.srt.layers.quantization.modelslim.modelslim_moe import ModelSlimMoEMethod +from sglang.srt.layers.quantization.modelslim.schemes import ( ModelSlimScheme, ModelSlimW4A4Int4, ModelSlimW8A8Int8, diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py similarity index 99% rename from python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py rename to python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py index c9dc5621cadf..94d1d3a660c2 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py +++ b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py @@ -20,7 +20,7 @@ CombineInput, StandardDispatchOutput, ) - from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig + from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py new file mode 100644 index 000000000000..551b862a4424 --- /dev/null +++ b/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 + +from .modelslim_scheme import ModelSlimScheme +from .modelslim_w4a4_int4 import ModelSlimW4A4Int4 +from .modelslim_w8a8_int8 import ModelSlimW8A8Int8 + +__all__ = [ + "ModelSlimScheme", + "ModelSlimW8A8Int8", + "ModelSlimW4A4Int4", +] diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_scheme.py similarity index 100% rename from python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py rename to python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_scheme.py diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py similarity index 97% rename from python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py rename to python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py index 6fb7561cc438..8e7f08277f99 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py +++ b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py @@ -9,7 +9,7 @@ NPU_W4A4DynamicLinearMethod, ) from sglang.srt.layers.parameter import PerTensorScaleParameter -from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme +from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimScheme from sglang.srt.utils import set_weight_attrs diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py similarity index 98% rename from python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py rename to python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py index 9986e1976eaf..16c62d551fa3 100644 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py @@ -14,7 +14,7 @@ ModelWeightParameter, PerTensorScaleParameter, ) -from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme +from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimScheme class ModelSlimW8A8Int8(ModelSlimScheme): diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md deleted file mode 100644 index 65f5eb029323..000000000000 --- a/python/sglang/srt/layers/quantization/msmodelslim/README.md +++ /dev/null @@ -1,14 +0,0 @@ -Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module. - -`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with MSModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config. - -MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as: -- [x] W4A4 dynamic linear -- [x] W8A8 static linear -- [x] W8A8 dynamic linear -- [x] W4A8 dynamic MOE -- [x] W8A8 dynamic MOE - -Also MsModelSlim module include: -- [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag) -- [x] Unit-tests for w4a4 modelslim, w8a8 modelslim diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py deleted file mode 100644 index fba516eed7c0..000000000000 --- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from .msmodelslim_scheme import ModelSlimScheme -from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4 -from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8 - -__all__ = [ - "ModelSlimScheme", - "ModelSlimW8A8Int8", - "ModelSlimW4A4Int4", -] From d6f0064f82eecb984db59b40ec0dedf88e0b3a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 30 Dec 2025 08:40:40 +0300 Subject: [PATCH 154/175] Fix w4a4 test --- test/srt/ascend/test_ascend_w4a4_quantization.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index ba4049c6d8a9..33d1f62a81b4 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -78,20 +78,6 @@ def test_gsm8k(self): self.assertGreaterEqual(metrics["accuracy"], 0.80) self.assertGreaterEqual(metrics["output_throughput"], 1000) - def run_decode(self, max_new_tokens): - response = requests.post( - self.base_url + "/generate", - json={ - "text": "The capital of France is", - "sampling_params": { - "temperature": 0, - "max_new_tokens": max_new_tokens, - }, - "ignore_eos": True, - }, - ) - return response.json() - def test_throughput(self): max_tokens = 256 @@ -103,7 +89,7 @@ def test_throughput(self): print(f"Throughput: {throughput} tokens/s") if is_in_ci(): - self.assertAlmostEqual(throughput, 38) + self.assertAlmostEqual(throughput, 35) if __name__ == "__main__": From 0aad1d189ad69d1f0732e4dc01f67dba888bc0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 30 Dec 2025 08:45:36 +0300 Subject: [PATCH 155/175] Fix link issue --- test/srt/ascend/test_ascend_w4a4_quantization.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 33d1f62a81b4..bcaaf5d3fc84 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -9,8 +9,6 @@ from types import SimpleNamespace from urllib.parse import urlparse -import requests - from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval from sglang.test.test_utils import ( From e861924a2762d6d800b48011ba502231c9b13078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 30 Dec 2025 08:50:44 +0300 Subject: [PATCH 156/175] Return run_decode to test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index bcaaf5d3fc84..2bc0df0c1687 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -9,6 +9,8 @@ from types import SimpleNamespace from urllib.parse import urlparse +import requests + from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval from sglang.test.test_utils import ( @@ -76,6 +78,20 @@ def test_gsm8k(self): self.assertGreaterEqual(metrics["accuracy"], 0.80) self.assertGreaterEqual(metrics["output_throughput"], 1000) + def run_decode(self, max_new_tokens): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": max_new_tokens, + }, + "ignore_eos": True, + }, + ) + return response.json() + def test_throughput(self): max_tokens = 256 From a443cf976d51b6ebd992f08275824651c872b02d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 30 Dec 2025 09:34:37 +0300 Subject: [PATCH 157/175] Update modelslim_moe.py --- .../srt/layers/quantization/modelslim/modelslim_moe.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py index 94d1d3a660c2..095d09f31155 100644 --- a/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py +++ b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py @@ -241,11 +241,8 @@ def apply( layer, dispatch_output: "StandardDispatchOutput", ) -> "CombineInput": - # FIXME W4A8 without EP gives 0 accuracy - raise NotImplementedError( - f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep" - ) - # return self.kernel.apply(layer, dispatch_output) + # FIXME W4A8 without EP can give 0 accuracy + return self.kernel.apply(layer, dispatch_output) def apply_without_routing_weights( self, From 373b9c5ac967a1cff11eace94f99624dc0cfdc9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 30 Dec 2025 09:37:01 +0300 Subject: [PATCH 158/175] Fix link --- python/sglang/srt/layers/quantization/modelslim/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/modelslim/README.md b/python/sglang/srt/layers/quantization/modelslim/README.md index 3d34b67ae712..dd3f35ff3fbf 100644 --- a/python/sglang/srt/layers/quantization/modelslim/README.md +++ b/python/sglang/srt/layers/quantization/modelslim/README.md @@ -1,4 +1,4 @@ -Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/modelslim) module. +Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module. `--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with ModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config. From 86093bbf11c0a0d6f6256d615af6c56988ba6280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 30 Dec 2025 09:38:41 +0300 Subject: [PATCH 159/175] Fix link again --- python/sglang/srt/layers/quantization/modelslim/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/modelslim/README.md b/python/sglang/srt/layers/quantization/modelslim/README.md index dd3f35ff3fbf..d2a43d696741 100644 --- a/python/sglang/srt/layers/quantization/modelslim/README.md +++ b/python/sglang/srt/layers/quantization/modelslim/README.md @@ -1,4 +1,4 @@ -Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module. +Quantization [ModelSlim](https://gitcode.com/Ascend/msit) module. `--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with ModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config. From 70f2fabbe544cb2d1d10a9086230a13735f55e4d Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Tue, 30 Dec 2025 11:18:22 +0300 Subject: [PATCH 160/175] Add w4a8 strategy to compressed-tensors --- .../compressed_tensors/compressed_tensors.py | 22 +++++++++++++++++++ .../compressed_tensors_moe.py | 6 +---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 0ed642950fbc..e3f9725e5440 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -310,6 +310,28 @@ def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bo else: return False + def _is_dynamic_token_w4a8( + self, weight_quant: BaseModel, input_quant: BaseModel + ) -> bool: + is_weight_4_bits = weight_quant.num_bits == 4 + is_activation_8_bits = input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.GROUP.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value + ) + is_token = ( + weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value + ) + is_dynamic = not weight_quant.dynamic and input_quant.dynamic + + return ( + is_weight_4_bits + and is_activation_8_bits + and is_token + and weight_quant.symmetric + and is_dynamic + ) + def _is_static_tensor_w8a8( self, weight_quant: BaseModel, input_quant: BaseModel ) -> bool: diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 15ddb1d25a8b..7ef04ef637ad 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -139,11 +139,7 @@ def get_moe_method( raise NotImplementedError( f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now." ) - elif ( - quant_config._is_dynamic_token_w4(weight_quant, input_quant) - and input_quant is not None - ): - # TODO add w4a8 verification method + elif quant_config._is_dynamic_token_w4a8(weight_quant, input_quant): if _is_npu: logger.info_once("Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod") return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config) From d5ad3a19ac52b58017abac2371b7a6daddff533a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 30 Dec 2025 14:01:44 +0300 Subject: [PATCH 161/175] Fix test again --- test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 2bc0df0c1687..e77adb7df159 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -103,7 +103,7 @@ def test_throughput(self): print(f"Throughput: {throughput} tokens/s") if is_in_ci(): - self.assertAlmostEqual(throughput, 35) + self.assertGreaterEqual(throughput, 35) if __name__ == "__main__": From a657e8727ffd99aeee27c96ae7e5cf127227b060 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Sat, 3 Jan 2026 21:47:47 +0300 Subject: [PATCH 162/175] Update test order --- test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++-- test/srt/run_suite.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index e77adb7df159..8674786a8433 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -76,7 +76,7 @@ def test_gsm8k(self): print(metrics) self.assertGreaterEqual(metrics["accuracy"], 0.80) - self.assertGreaterEqual(metrics["output_throughput"], 1000) + self.assertGreaterEqual(metrics["output_throughput"], 700) def run_decode(self, max_new_tokens): response = requests.post( @@ -103,7 +103,7 @@ def test_throughput(self): print(f"Throughput: {throughput} tokens/s") if is_in_ci(): - self.assertGreaterEqual(throughput, 35) + self.assertGreaterEqual(throughput, 25) if __name__ == "__main__": diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index ad726b40b238..777382c83388 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -364,12 +364,12 @@ TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), ], "per-commit-4-npu-a2": [ + TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_mla_w8a8int8.py", 400), TestFile("ascend/test_ascend_hicache_mla.py", 400), TestFile("ascend/test_ascend_tp4_bf16.py", 400), ], "per-commit-16-npu-a3": [ - TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_deepep.py", 400), TestFile("ascend/test_ascend_deepseek_mtp.py", 400), ], From ff565dbc927ff6a13ac11857d861e6d373e08174 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Sun, 4 Jan 2026 13:44:46 +0300 Subject: [PATCH 163/175] Move w4a4_test to a2-tp1 suite --- test/srt/ascend/test_ascend_w4a4_quantization.py | 14 ++++++-------- test/srt/run_suite.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 8674786a8433..f6982bfcaa82 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -21,10 +21,8 @@ popen_launch_server, ) -os.environ["SGLANG_USE_MODELSCOPE"] = "true" - if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0" DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 ) @@ -47,11 +45,11 @@ def setUpClass(cls): "--attention-backend", "ascend", "--tp-size", - "4", + "1", "--mem-fraction-static", "0.8", "--cuda-graph-bs", - "64", + "16", "--disable-radix-cache", ], ) @@ -68,7 +66,7 @@ def test_gsm8k(self): data_path=None, num_questions=1319, max_new_tokens=512, - parallel=64, + parallel=16, host=f"http://{url.hostname}", port=int(url.port), ) @@ -76,7 +74,7 @@ def test_gsm8k(self): print(metrics) self.assertGreaterEqual(metrics["accuracy"], 0.80) - self.assertGreaterEqual(metrics["output_throughput"], 700) + self.assertGreaterEqual(metrics["output_throughput"], 500) def run_decode(self, max_new_tokens): response = requests.post( @@ -103,7 +101,7 @@ def test_throughput(self): print(f"Throughput: {throughput} tokens/s") if is_in_ci(): - self.assertGreaterEqual(throughput, 25) + self.assertGreaterEqual(throughput, 15) if __name__ == "__main__": diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 6862cb72b061..233f0d9604e0 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -355,6 +355,7 @@ TestFile("ascend/test_ascend_sampling_backend.py", 400), TestFile("ascend/test_ascend_tp1_bf16.py", 400), TestFile("ascend/test_ascend_compile_graph_tp1_bf16.py", 400), + TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_w8a8_quantization.py", 400), TestFile("test_embed_interpolate_unittest.py", 400), ], @@ -365,7 +366,6 @@ TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), ], "per-commit-4-npu-a2": [ - TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_mla_w8a8int8.py", 400), TestFile("ascend/test_ascend_hicache_mla.py", 400), TestFile("ascend/test_ascend_tp4_bf16.py", 400), From c97c232aadb8d7fadb288d7da46f48078d79ad70 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Sun, 4 Jan 2026 13:47:46 +0300 Subject: [PATCH 164/175] Move w4a4_test to a2-tp1 suite --- .../ascend/test_ascend_w4a4_quantization.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index f6982bfcaa82..420ffd113bbf 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -9,8 +9,6 @@ from types import SimpleNamespace from urllib.parse import urlparse -import requests - from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval from sglang.test.test_utils import ( @@ -64,7 +62,7 @@ def test_gsm8k(self): args = SimpleNamespace( num_shots=5, data_path=None, - num_questions=1319, + num_questions=200, max_new_tokens=512, parallel=16, host=f"http://{url.hostname}", @@ -76,20 +74,6 @@ def test_gsm8k(self): self.assertGreaterEqual(metrics["accuracy"], 0.80) self.assertGreaterEqual(metrics["output_throughput"], 500) - def run_decode(self, max_new_tokens): - response = requests.post( - self.base_url + "/generate", - json={ - "text": "The capital of France is", - "sampling_params": { - "temperature": 0, - "max_new_tokens": max_new_tokens, - }, - "ignore_eos": True, - }, - ) - return response.json() - def test_throughput(self): max_tokens = 256 From c190ea30a66aadcb7de947f8b83fe90bbe569d1d Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Sun, 4 Jan 2026 15:02:37 +0300 Subject: [PATCH 165/175] Return w4a4 to A3 --- test/srt/ascend/test_ascend_w4a4_quantization.py | 8 ++++---- test/srt/run_suite.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 420ffd113bbf..0a287e869400 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -20,7 +20,7 @@ ) if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3" DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 ) @@ -43,7 +43,7 @@ def setUpClass(cls): "--attention-backend", "ascend", "--tp-size", - "1", + "4", "--mem-fraction-static", "0.8", "--cuda-graph-bs", @@ -72,7 +72,7 @@ def test_gsm8k(self): print(metrics) self.assertGreaterEqual(metrics["accuracy"], 0.80) - self.assertGreaterEqual(metrics["output_throughput"], 500) + self.assertGreaterEqual(metrics["output_throughput"], 1000) def test_throughput(self): max_tokens = 256 @@ -85,7 +85,7 @@ def test_throughput(self): print(f"Throughput: {throughput} tokens/s") if is_in_ci(): - self.assertGreaterEqual(throughput, 15) + self.assertGreaterEqual(throughput, 35) if __name__ == "__main__": diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 233f0d9604e0..9cc7b17ba30f 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -355,7 +355,6 @@ TestFile("ascend/test_ascend_sampling_backend.py", 400), TestFile("ascend/test_ascend_tp1_bf16.py", 400), TestFile("ascend/test_ascend_compile_graph_tp1_bf16.py", 400), - TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_w8a8_quantization.py", 400), TestFile("test_embed_interpolate_unittest.py", 400), ], @@ -373,6 +372,7 @@ "per-commit-16-npu-a3": [ TestFile("ascend/test_ascend_deepep.py", 400), TestFile("ascend/test_ascend_deepseek_mtp.py", 400), + TestFile("ascend/test_ascend_w4a4_quantization.py", 400), ], } From 659fa074b1d49307dec31515eb32cd5cd5aa5252 Mon Sep 17 00:00:00 2001 From: OrangeRedeng Date: Sun, 4 Jan 2026 15:42:53 +0300 Subject: [PATCH 166/175] Remove unused is_npu() --- python/sglang/srt/configs/model_config.py | 3 +-- python/sglang/srt/layers/quantization/__init__.py | 11 ++--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 327ef0466bcd..4c08ce5eace7 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -26,7 +26,7 @@ from sglang.srt.environ import envs from sglang.srt.layers.quantization import QUANTIZATION_METHODS from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import is_hip, is_npu, retry +from sglang.srt.utils import is_hip, retry from sglang.srt.utils.hf_transformers_utils import ( get_config, get_context_length, @@ -37,7 +37,6 @@ from sglang.utils import is_in_ci logger = logging.getLogger(__name__) -_is_npu = is_npu() class AttentionArch(IntEnum): diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 161301d6124e..ba9755c6a04a 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -31,6 +31,7 @@ def override_quantization_method(self, *args, **kwargs): ModelOptFp4Config, ModelOptFp8Config, ) +from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config from sglang.srt.layers.quantization.petit import PetitNvFp4Config @@ -68,6 +69,7 @@ def override_quantization_method(self, *args, **kwargs): "fbgemm_fp8": FBGEMMFp8Config, "quark": QuarkConfig, "auto-round": AutoRoundConfig, + "modelslim": ModelSlimConfig, } @@ -78,15 +80,6 @@ def override_quantization_method(self, *args, **kwargs): } ) -if is_npu(): - from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig - - BASE_QUANTIZATION_METHODS.update( - { - "modelslim": ModelSlimConfig, - } - ) - QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS} From 4716b7320f80ed908ed721fd2c04aaf7432ad7d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 12 Jan 2026 12:34:41 +0300 Subject: [PATCH 167/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 0a287e869400..5e8b729966f2 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -47,7 +47,7 @@ def setUpClass(cls): "--mem-fraction-static", "0.8", "--cuda-graph-bs", - "16", + "64", "--disable-radix-cache", ], ) @@ -62,9 +62,9 @@ def test_gsm8k(self): args = SimpleNamespace( num_shots=5, data_path=None, - num_questions=200, + num_questions=1319, max_new_tokens=512, - parallel=16, + parallel=64, host=f"http://{url.hostname}", port=int(url.port), ) From 42d849e9c08d1d148c0a6a3ad8dbd69f103b59b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 12 Jan 2026 14:56:22 +0300 Subject: [PATCH 168/175] Fix test_ascend_piecewise_graph_prefill test --- test/srt/ascend/test_ascend_piecewise_graph_prefill.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_piecewise_graph_prefill.py b/test/srt/ascend/test_ascend_piecewise_graph_prefill.py index 9e43ca60f74b..13c9d991a0bb 100644 --- a/test/srt/ascend/test_ascend_piecewise_graph_prefill.py +++ b/test/srt/ascend/test_ascend_piecewise_graph_prefill.py @@ -38,7 +38,7 @@ def setUpClass(cls): 128, "--enable-piecewise-cuda-graph", "--piecewise-cuda-graph-tokens", - TOKENS_TO_CAPTURE, + *TOKENS_TO_CAPTURE, ], ) @@ -79,7 +79,7 @@ def test_latency(self): "ascend", "--enable-piecewise-cuda-graph", "--piecewise-cuda-graph-tokens", - TOKENS_TO_CAPTURE, + *TOKENS_TO_CAPTURE, ], ) self.assertLess(prefill_latency, EXP_PREFILL_LATENCY) From 9a95ff824ecbb65b7f22355339ebf25e506e352e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:06:19 +0300 Subject: [PATCH 169/175] Move w4a4 test to A2 --- test/srt/run_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 122e4cf8ceea..6d146c7cbc76 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -158,6 +158,7 @@ TestFile("test_embed_interpolate_unittest.py", 400), ], "per-commit-2-npu-a2": [ + TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400), TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), @@ -171,7 +172,6 @@ "per-commit-16-npu-a3": [ TestFile("ascend/test_ascend_deepep.py", 400), TestFile("ascend/test_ascend_deepseek_mtp.py", 400), - TestFile("ascend/test_ascend_w4a4_quantization.py", 400), ], } From d323c6a12e79d93ddd399184d5dcd14e248843fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:07:18 +0300 Subject: [PATCH 170/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 5e8b729966f2..e838c87f9612 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -20,7 +20,7 @@ ) if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 ) @@ -43,7 +43,7 @@ def setUpClass(cls): "--attention-backend", "ascend", "--tp-size", - "4", + "2", "--mem-fraction-static", "0.8", "--cuda-graph-bs", From 7e3d2815cc310dde4212a3f72e6c20439f89e154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:00:40 +0300 Subject: [PATCH 171/175] Update run_suite.py --- test/srt/run_suite.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 6d146c7cbc76..581a1e09c6a2 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -158,7 +158,6 @@ TestFile("test_embed_interpolate_unittest.py", 400), ], "per-commit-2-npu-a2": [ - TestFile("ascend/test_ascend_w4a4_quantization.py", 400), TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400), TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), @@ -170,8 +169,9 @@ TestFile("ascend/test_ascend_tp4_bf16.py", 400), ], "per-commit-16-npu-a3": [ - TestFile("ascend/test_ascend_deepep.py", 400), - TestFile("ascend/test_ascend_deepseek_mtp.py", 400), + TestFile("ascend/test_ascend_deepep.py", 3600), + TestFile("ascend/test_ascend_deepseek_mtp.py", 2800), + TestFile("ascend/test_ascend_w4a4_quantization.py", 600), ], } From 601a349a339fc13f95e0442566d1a740d382a9b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:00:58 +0300 Subject: [PATCH 172/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index e838c87f9612..5e8b729966f2 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -20,7 +20,7 @@ ) if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3" DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 ) @@ -43,7 +43,7 @@ def setUpClass(cls): "--attention-backend", "ascend", "--tp-size", - "2", + "4", "--mem-fraction-static", "0.8", "--cuda-graph-bs", From 0d16e53ba6cce3ee38d6b21fef25b1c9ab5beb4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 14:59:50 +0300 Subject: [PATCH 173/175] Update test_ascend_w4a4_quantization.py --- test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 5e8b729966f2..7c5e33547371 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -72,7 +72,7 @@ def test_gsm8k(self): print(metrics) self.assertGreaterEqual(metrics["accuracy"], 0.80) - self.assertGreaterEqual(metrics["output_throughput"], 1000) + self.assertGreaterEqual(metrics["output_throughput"], 1050) def test_throughput(self): max_tokens = 256 From 7b9e6143c3034a3ddeaf372e506993452b241bb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 17:47:25 +0300 Subject: [PATCH 174/175] Fix w4a4 test --- test/srt/ascend/test_ascend_w4a4_quantization.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index 7c5e33547371..ee56f5b9825f 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -72,7 +72,21 @@ def test_gsm8k(self): print(metrics) self.assertGreaterEqual(metrics["accuracy"], 0.80) - self.assertGreaterEqual(metrics["output_throughput"], 1050) + self.assertGreaterEqual(metrics["output_throughput"], 1000) + + def run_decode(self, max_new_tokens): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": max_new_tokens, + }, + "ignore_eos": True, + }, + ) + return response.json() def test_throughput(self): max_tokens = 256 From a79e4b9ae5b1af20993242c4199ea8bc43e175c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:24:15 +0300 Subject: [PATCH 175/175] Fix w4a4 test --- test/srt/ascend/test_ascend_w4a4_quantization.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py index ee56f5b9825f..22d3f0615181 100644 --- a/test/srt/ascend/test_ascend_w4a4_quantization.py +++ b/test/srt/ascend/test_ascend_w4a4_quantization.py @@ -9,6 +9,8 @@ from types import SimpleNamespace from urllib.parse import urlparse +import requests + from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval from sglang.test.test_utils import (