From dcea881b2254b9cc29a3f26f64dba57384d1acbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 16:56:40 +0300
Subject: [PATCH 001/175] Automatic quant_model_description.json detection
 support

---
 python/sglang/srt/configs/model_config.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 26dfbe5eb1d5..e4af64b1a116 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -16,6 +16,7 @@
 import logging
 import math
 import os
+from pathlib import Path
 from enum import Enum, IntEnum, auto
 from typing import Any, List, Optional, Set, Union
 
@@ -36,7 +37,7 @@
 from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
-
+_is_npu = is_npu()
 
 class AttentionArch(IntEnum):
     MLA = auto()
@@ -560,6 +561,16 @@ def _parse_quant_hf_config(self):
                 quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
         return quant_cfg
 
+    def _find_quant_modelslim_config(self):
+        quant_config_file = Path(self.model_path, "quant_model_description.json")
+        if quant_config_file.is_file():
+            with open(quant_config_file) as f:
+                quant_cfg = json.load(f)
+        else:
+            quant_cfg = None
+                            
+        return quant_cfg
+
     def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
         """Parse ModelOpt quantization config and return the appropriate quant_method."""
         json_quant_configs = quant_config_dict["quantization"]
@@ -678,6 +689,9 @@ def _verify_quantization(self) -> None:
 
         # Parse quantization method from the HF model config, if available.
         quant_cfg = self._parse_quant_hf_config()
+        if _is_npu:
+            quant_cfg = self._find_quant_modelslim_config()
+            self.quantization = 'modelslim'
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get(

From aa0a0aa4e6171a9de10c459246dd60162013ae02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:25:20 +0300
Subject: [PATCH 002/175] Add w4a4 support

---
 .../hardware_backend/npu/quantization/w4a4.py | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a4.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
new file mode 100644
index 000000000000..e1d7a8b8a6cb
--- /dev/null
+++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
@@ -0,0 +1,35 @@
+class NPU_W4A4DynamicLinearMethodImpl:
+    """Linear method for NPU W4A4_DYNAMIC."""
+
+    def __init__(self):
+        self.transpose_weight = True
+
+    @staticmethod
+    def apply(
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+        tp_rank: Optional[int] = 0,
+    ) -> torch.Tensor:
+        original_dtype = x.dtype
+        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(
+            x, dst_type=torch.quint4x2
+        )
+        return torch_npu.npu_quant_matmul(
+            quant_out,
+            layer.weight,
+            layer.weight_scale,
+            pertoken_scale=dynamic_scale,
+            bias=bias,
+            output_dtype=original_dtype,
+        )
+
+    def process_weights_after_loading(self, layer):
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        layer.weight.data = torch_npu.npu_convert_weight_to_int4pack(
+            layer.weight.data.to(torch.int32)
+        )

From 6c845aded27617bd937286e714d18d7c8560a5cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:32:28 +0300
Subject: [PATCH 003/175] Refactor w8a8

---
 .../hardware_backend/npu/quantization/w8a8.py | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py
new file mode 100644
index 000000000000..f9ad7f4a16ac
--- /dev/null
+++ b/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py
@@ -0,0 +1,100 @@
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase
+
+class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
+    """Linear method for NPU W8A8."""
+  
+    def __init__(self):
+        self.transpose_weight = True
+
+    @staticmethod
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.linear import RowParallelLinear
+
+        original_dtype = x.dtype
+        if original_dtype != torch.int8:
+            x = torch.ops.npu.npu_quantize(
+                x,
+                layer.aclnn_input_scale_reciprocal,
+                layer.aclnn_input_offset,
+                torch.qint8,
+                -1,
+                False,
+            )
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
+        return torch.ops.npu.npu_quant_matmul(
+            x,
+            layer.weight,
+            layer.deq_scale,
+            bias=quant_bias,
+            output_dtype=original_dtype,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
+        layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
+
+        expanding_factor = layer.weight.data.shape[0]
+        layer.aclnn_input_scale = torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_offset = torch.nn.Parameter(
+            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+
+
+class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase):
+    """Linear method for NPU W8A8_DYNAMIC."""
+  
+    def __init__(self):
+        self.transpose_weight = True
+
+    @staticmethod
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        original_dtype = x.dtype
+        quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x)
+        return torch.ops.npu.npu_quant_matmul(
+            quant_out,
+            layer.weight,
+            layer.weight_scale,
+            pertoken_scale=dynamic_scale,
+            bias=bias,
+            output_dtype=original_dtype,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_offset.data = layer.weight_offset.data.flatten()

From dee644b2946df18006dfad370a022cf5b60beb91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:34:04 +0300
Subject: [PATCH 004/175] Add import section

---
 .../sglang/srt/hardware_backend/npu/quantization/w4a4.py   | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
index e1d7a8b8a6cb..4676b4655872 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
@@ -1,3 +1,10 @@
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase
+
 class NPU_W4A4DynamicLinearMethodImpl:
     """Linear method for NPU W4A4_DYNAMIC."""
 

From 35b8983a65b28a51eaca5333bade407716ef97eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:35:26 +0300
Subject: [PATCH 005/175] Create quantization utils file

---
 .../hardware_backend/npu/quantization/utils.py    | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/utils.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/utils.py b/python/sglang/srt/hardware_backend/npu/quantization/utils.py
new file mode 100644
index 000000000000..0350d85e6400
--- /dev/null
+++ b/python/sglang/srt/hardware_backend/npu/quantization/utils.py
@@ -0,0 +1,15 @@
+from typing import TYPE_CHECKING, List, Optional
+
+from sglang.srt.layers.quantization.base_config import LinearMethodBase
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+class _NPULinearMethodBase(LinearMethodBase):
+
+    def __init__(
+        self,
+        quant_config: Optional["QuantizationConfig"] = None,
+    ):
+        super().__init__()
+        self.quant_config = quant_config

From 311cc288153fc00c65d0766287a63cbde5988347 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:40:05 +0300
Subject: [PATCH 006/175] Create w4a16

---
 .../npu/quantization/w4a16.py                 | 195 ++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a16.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py
new file mode 100644
index 000000000000..2f3f2a4539f3
--- /dev/null
+++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py
@@ -0,0 +1,195 @@
+from typing import TYPE_CHECKING
+
+import numpy as np
+import torch
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+
+class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self) -> None:
+        self.group_size = 256
+        self.tp_size = 1
+
+    def pack_to_int32(self, weight: torch.Tensor):
+        assert weight.dim() == 3
+        if weight.dtype == torch.int32:
+            # pack 8 int4 to int32, we use a int32 to represent a int4
+            assert (
+                weight.shape[-1] % 8 == 0
+            ), "the last dim of weight needs to be divided by 8"
+            new_weight = torch.ops.npu.npu_convert_weight_to_int4pack(
+                weight.flatten(0, 1)
+            )
+            new_weight = new_weight.view(weight.shape[0], weight.shape[1], -1)
+        elif weight.dtype == torch.int8:
+            # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
+            assert (
+                weight.shape[-1] % 4 == 0
+            ), "the last dim of weight needs to be divided by 4"
+            new_weight = weight.view(torch.int32).contiguous()
+        else:
+            raise ValueError(f"{weight.dtype=} is not supported !")
+        return new_weight
+
+    def unpack_from_int32(
+        self,
+        value: torch.Tensor,
+        num_bits: int,
+        shape: torch.Size = None,
+        packed_dim=1,
+    ) -> torch.Tensor:
+        """
+        Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
+        original bit range.
+
+        Return tensors in int8
+
+        :param value: tensor to unpack
+        :param num_bits: number of bits to unpack each data point into
+        :param shape: shape to unpack into, used to remove padding
+        :returns: unpacked int8 tensor
+        """
+        if value.dtype is not torch.int32:
+            raise ValueError(
+                f"Expected {torch.int32} but got {value.dtype}, Aborting unpack."
+            )
+
+        if num_bits > 8:
+            raise ValueError("Unpacking is only supported for less than 8 bits")
+
+        pack_factor = 32 // num_bits
+
+        # unpack
+        mask = (1 << num_bits) - 1
+
+        if packed_dim == 1:
+            unpacked = torch.zeros(
+                (value.shape[0], value.shape[1] * pack_factor),
+                device=value.device,
+                dtype=torch.int32,
+            )
+            for i in range(pack_factor):
+                unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+            # remove padding
+            if shape is not None:
+                original_row_size = int(shape[1])
+                unpacked = unpacked[:, :original_row_size]
+        else:
+            unpacked = torch.zeros(
+                (value.shape[0] * pack_factor, value.shape[1]),
+                device=value.device,
+                dtype=torch.int32,
+            )
+            for i in range(pack_factor):
+                unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+            # remove padding
+            original_row_size = int(shape[0])
+            unpacked = unpacked[:original_row_size, :]
+
+        # bits are packed in unsigned format, reformat to signed
+        # update the value range from unsigned to signed
+        offset = pow(2, num_bits) // 2
+        unpacked = (unpacked - offset).to(torch.int8)
+
+        return unpacked
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous()
+        w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous()
+        layer.w13_weight_scale = torch.nn.Parameter(
+            w13_weight_scale, requires_grad=False
+        )
+        layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False)
+
+        layer.w13_weight_offset = torch.nn.Parameter(
+            layer.w13_weight_offset.data.transpose(-1, -2).contiguous(),
+            requires_grad=False,
+        )
+        layer.w2_weight_offset = torch.nn.Parameter(
+            layer.w2_weight_offset.data.transpose(-1, -2).contiguous(),
+            requires_grad=False,
+        )
+
+        # w = [n, k // 8]  --> [k, n // 8]
+        # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous()
+        # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous()
+        unpacked_w13_weight = (
+            self.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4)
+            .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1)
+            .transpose(1, 2)
+            .contiguous()
+            .int()
+        )
+        unpacked_w2_weight = (
+            self.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4)
+            .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1)
+            .transpose(1, 2)
+            .contiguous()
+            .int()
+        )
+
+        w13_weight = self.pack_to_int32(unpacked_w13_weight)
+        w2_weight = self.pack_to_int32(unpacked_w2_weight)
+
+        layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+        layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        # FIXME W4A8 only support with deepep
+        raise NotImplementedError(
+            f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep"
+        )
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[self.w13_weight],
+            scale=[self.w13_weight_scale],
+            bias=[self.w13_scale_bias],
+            per_token_scale=[hidden_states_scale],
+            group_list=group_list,
+            split_item=2,
+            group_type=0,
+            group_list_type=group_list_type,
+            output_dtype=output_dtype,
+        )[0]
+
+        # act_fn: swiglu
+        hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[self.w2_weight],
+            scale=[self.w2_weight_scale],
+            bias=[self.w2_scale_bias],
+            per_token_scale=[swiglu_out_scale],
+            group_list=group_list,
+            split_item=2,
+            group_type=0,
+            group_list_type=group_list_type,
+            output_dtype=output_dtype,
+        )[0]
+
+        return hidden_states

From 6869ebf700f54aeb9b26ff65744b6904b0052a1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:43:02 +0300
Subject: [PATCH 007/175] Create w4a8.py

---
 .../hardware_backend/npu/quantization/w4a8.py | 148 ++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py
new file mode 100644
index 000000000000..3696c4d36380
--- /dev/null
+++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py
@@ -0,0 +1,148 @@
+from typing import TYPE_CHECKING
+
+import numpy as np
+import torch
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+
+class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self) -> None:
+        self.group_size = 256
+        self.tp_size = 1
+
+    def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
+        scale = scale.transpose(1, 2).contiguous()
+        per_group_scale = per_group_scale.transpose(1, 2).contiguous()
+        group_num, k, n = weight.shape
+        # the weight of the new version is reduced by half by pack n, so it needs to be restored
+        n = n * 2
+        per_group_scale = per_group_scale.reshape(group_num, -1, n)
+        group_num, quantgroup_num, n = per_group_scale.shape
+        bias = None
+
+        scale_fp32 = (scale * per_group_scale).to(torch.float16).to(torch.float32)
+        scale_fp32_np = scale_fp32.cpu().numpy()
+        scale_fp32_np.dtype = np.uint32
+        sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), dtype=np.uint32)
+
+        sscale_uint64[..., ::2] = scale_fp32_np
+
+        sscale_uint64_buffer = np.frombuffer(
+            sscale_uint64.tobytes(), dtype=np.int64
+        ).copy()
+        sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape(
+            group_num, quantgroup_num, n
+        )
+        sscale_uint64_tensor = sscale_uint64_tensor.npu()
+        return sscale_uint64_tensor, bias
+
+    def update_bias(self, layer, w13_bias, w2_bias):
+        layer.w13_scale_bias.data = (
+            layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
+        )
+        layer.w2_scale_bias.data = (
+            layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
+        )
+
+    def pack_to_int32(self, weight: torch.Tensor):
+        # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
+        assert (
+            weight.shape[-1] % 4 == 0
+        ), "the last dim of weight needs to be divided by 4"
+        return weight.view(torch.int32).contiguous()
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.w13_weight = torch.nn.Parameter(
+            layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
+        )
+        layer.w2_weight = torch.nn.Parameter(
+            layer.w2_weight.data.transpose(1, 2).contiguous(), requires_grad=False
+        )
+
+        w13_weight_scale_second = (
+            layer.w13_weight_scale_second.data
+            if hasattr(layer, "w13_weight_scale_second")
+            else None
+        )
+        w2_weight_scale_second = (
+            layer.w2_weight_scale_second.data
+            if hasattr(layer, "w2_weight_scale_second")
+            else None
+        )
+        layer.w13_weight_scale.data, w13_bias = self.process_scale(
+            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second
+        )
+        layer.w2_weight_scale.data, w2_bias = self.process_scale(
+            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second
+        )
+        if hasattr(layer, "w13_weight_scale_second"):
+            # scale_second is no longer used, release this part of the memory
+            del layer.w13_weight_scale_second
+            del layer.w2_weight_scale_second
+            del layer.w13_weight_offset_second
+            del layer.w2_weight_offset_second
+
+        self.update_bias(layer, w13_bias, w2_bias)
+
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
+        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
+        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        # FIXME W4A8 only support with deepep
+        raise NotImplementedError(
+            f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep"
+        )
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[self.w13_weight],
+            scale=[self.w13_weight_scale],
+            bias=[self.w13_scale_bias],
+            per_token_scale=[hidden_states_scale],
+            group_list=group_list,
+            split_item=2,
+            group_type=0,
+            group_list_type=group_list_type,
+            output_dtype=output_dtype,
+        )[0]
+
+        # act_fn: swiglu
+        hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[self.w2_weight],
+            scale=[self.w2_weight_scale],
+            bias=[self.w2_scale_bias],
+            per_token_scale=[swiglu_out_scale],
+            group_list=group_list,
+            split_item=2,
+            group_type=0,
+            group_list_type=group_list_type,
+            output_dtype=output_dtype,
+        )[0]
+
+        return hidden_states

From c7d6dd5c521db342e9ebed6e07317d4f4c4ee53f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:46:38 +0300
Subject: [PATCH 008/175] Rename w4a16.py to w4a16_moe.py

---
 .../hardware_backend/npu/quantization/{w4a16.py => w4a16_moe.py}  | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename python/sglang/srt/hardware_backend/npu/quantization/{w4a16.py => w4a16_moe.py} (100%)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a16.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py
similarity index 100%
rename from python/sglang/srt/hardware_backend/npu/quantization/w4a16.py
rename to python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py

From 7ffe0f62314626950bb3385c18434cbe51613c41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:46:48 +0300
Subject: [PATCH 009/175] Rename w4a8.py to w4a8_moe.py

---
 .../hardware_backend/npu/quantization/{w4a8.py => w4a8_moe.py}    | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename python/sglang/srt/hardware_backend/npu/quantization/{w4a8.py => w4a8_moe.py} (100%)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py
similarity index 100%
rename from python/sglang/srt/hardware_backend/npu/quantization/w4a8.py
rename to python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py

From e2d8889cbcf704e95775ac398c5622855262364d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:49:14 +0300
Subject: [PATCH 010/175] Create w8a8_moe

---
 .../npu/quantization/w8a8_moe.py              | 215 ++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py
new file mode 100644
index 000000000000..789e5b516ced
--- /dev/null
+++ b/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py
@@ -0,0 +1,215 @@
+from typing import TYPE_CHECKING
+
+import numpy as np
+import torch
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+
+def npu_fused_experts(
+    hidden_states: torch.Tensor,
+    w13: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    top_k: int,
+    **kwargs,
+):
+    w13_offset = kwargs.get("w13_offset", None)
+    w2_offset = kwargs.get("w2_offset", None)
+    use_wna16 = kwargs.get("use_wna16", False)
+
+    original_shape = hidden_states.shape
+    original_dtype = hidden_states.dtype
+    scale_dtype = original_dtype if original_dtype == torch.bfloat16 else torch.float32
+    if len(original_shape) == 3:
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+    num_tokens = hidden_states.shape[0]
+    num_experts = w13.shape[0]
+    row_idx_len = num_tokens * top_k
+    row_idx = (
+        torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device)
+        .view(top_k, -1)
+        .permute(1, 0)
+        .contiguous()
+    )
+    hidden_states, expanded_row_idx, expanded_expert_idx = (
+        torch.ops.npu.npu_moe_init_routing(
+            hidden_states, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens
+        )
+    )
+    expert_tokens = torch.ops.npu.npu_moe_compute_expert_tokens(
+        expanded_expert_idx, num_experts
+    )
+    expert_tokens = expert_tokens.to(torch.int64)
+    # gmm1: gate_up_proj
+    if not use_wna16:
+        hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+        scale_args13 = {
+            "scale": [w13_scale.to(scale_dtype)],
+            "per_token_scale": [pertoken_scale],
+        }
+    else:
+        scale_args13 = {
+            "antiquant_scale": [w13_scale],
+            "antiquant_offset": [w13_offset],
+        }
+
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w13],
+        **scale_args13,
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+    # act_fn: swiglu
+    hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+    if not use_wna16:
+        hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+
+        scale_args2 = {
+            "scale": [w2_scale.to(scale_dtype)],
+            "per_token_scale": [pertoken_scale],
+        }
+    else:
+        scale_args2 = {"antiquant_scale": [w2_scale], "antiquant_offset": [w2_offset]}
+    # gmm2: down_proj
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w2],
+        **scale_args2,
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+
+    final_hidden_states = torch.ops.npu.npu_moe_finalize_routing(
+        hidden_states,
+        skip1=None,
+        skip2=None,
+        bias=None,
+        scales=topk_weights,
+        expanded_src_to_dst_row=expanded_row_idx,
+        export_for_source_row=topk_ids,
+    )
+    if len(original_shape) == 3:
+        final_hidden_states = final_hidden_states.view(original_shape)
+    return final_hidden_states
+
+class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase):
+
+    ### TODO remove this ###
+    def release_weight_cache(self, weight: torch.Tensor):
+        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
+        origin_weight = weight.data.transpose(1, 2)
+        new_weight = origin_weight.contiguous()
+        origin_weight.untyped_storage().resize_(0)
+        return new_weight
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight_data = self.release_weight_cache(layer.w13_weight.data)
+        layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
+
+        weight_data = self.release_weight_cache(layer.w2_weight.data)
+        layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
+
+        layer.w13_weight_scale = torch.nn.Parameter(
+            layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
+            requires_grad=False,
+        )
+        layer.w2_weight_scale = torch.nn.Parameter(
+            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w13_weight_offset = torch.nn.Parameter(
+            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w2_weight_offset = torch.nn.Parameter(
+            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        topk_ids = topk_ids.to(torch.int32)
+        topk_weights = topk_weights.to(x.dtype)
+        output = npu_fused_experts(
+            hidden_states=x,
+            w13=layer.w13_weight,
+            w13_scale=layer.w13_weight_scale,
+            w2=layer.w2_weight,
+            w2_scale=layer.w2_weight_scale,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            top_k=topk_ids.shape[1],
+        )
+        return StandardCombineInput(hidden_states=output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        # gmm1: gate_up_proj
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[layer.w13_weight],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=torch.int32,
+        )[0]
+
+        # act_fn: swiglu
+        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dequant_swiglu_quant(
+            x=hidden_states,
+            weight_scale=layer.w13_weight_scale,
+            activation_scale=hidden_states_scale,
+            bias=None,
+            quant_scale=None,
+            quant_offset=None,
+            group_index=group_list,
+            activate_left=True,
+            quant_mode=1,
+        )
+
+        # gmm2: down_proj
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[layer.w2_weight],
+            scale=[layer.w2_weight_scale.to(output_dtype)],
+            per_token_scale=[swiglu_out_scale],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=output_dtype,
+        )[0]
+        return hidden_states

From 41d3d3f8500e1362aeff20c51bc8ab80cf3cfe2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 5 Dec 2025 17:54:12 +0300
Subject: [PATCH 011/175] Create w4a8.py

---
 .../hardware_backend/npu/quantization/w4a8.py | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py
new file mode 100644
index 000000000000..7cd4dc81486a
--- /dev/null
+++ b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+import importlib
+import sys
+from types import MappingProxyType
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.w8a8_int8 import NPU_W8A8DynamicLinearMethod
+from sglang.srt.utils import (
+    apply_module_patch,
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_npu,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_npu = is_npu()
+
+if _is_npu:
+    import torch_npu
+
+class NPU_W4A8DynamicLinearMethod:
+    """Linear method for NPU W4A8_DYNAMIC."""
+
+    def __init__(self):
+        self.transpose_weight = True
+        try:
+            self.group_size = self.quantization_config.get("group_size", 256)
+        except AttributeError:
+            self.group_size = 256
+        
+    @staticmethod
+    def process_scale_second(weight: torch.Tensor, scale: torch.Tensor,
+                             per_group_scale: torch.Tensor):
+        k, n = weight.shape
+        group_num, n = per_group_scale.shape
+        weight_high = weight.to(torch.float32).reshape(
+            group_num, -1, n) * per_group_scale.reshape(group_num, 1, n)
+        weight_high = weight_high.reshape(k, n)
+        bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0)
+        antiquant_scale = (scale * per_group_scale).reshape(group_num, n)
+        return antiquant_scale.npu(), bias
+    
+    @staticmethod
+    def apply(
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+        tp_rank: Optional[int] = 0,
+    ) -> torch.Tensor:
+        group_size = 256
+        return torch_npu.npu_weight_quant_batchmatmul(
+            x,
+            layer.weight,
+            antiquant_scale=layer.weight_scale_second.to(x.dtype),
+            antiquant_group_size=group_size,
+        )
+
+    def process_weights_after_loading(self, layer):
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten().to(
+            torch.float32)
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        layer.weight_scale_second.data, scale_bias = self.process_scale_second(
+            layer.weight.data,
+            layer.weight_scale.data,
+            layer.weight_scale_second.data.transpose(0, 1).contiguous(),
+        )
+        param = torch.nn.Parameter(scale_bias, requires_grad=False)
+        layer.register_parameter("weight_scale_bias", param)
+        layer.weight.data = torch_npu.npu_convert_weight_to_int4pack(
+            layer.weight.data.to(torch.int32))

From 6d0b035174c98d75abd2323211aa9012e2856c65 Mon Sep 17 00:00:00 2001
From: TamirBaydasov <mr.jeijy@gmail.com>
Date: Mon, 8 Dec 2025 05:20:59 +0300
Subject: [PATCH 012/175] Create msmodelslim structure, initial commit

---
 .../npu/quantization/modelslim.py             | 241 ----------
 .../quantization/msmodelslim/msmodelslim.py   | 426 ++++++++++++++++++
 .../msmodelslim/msmodelslim_moe.py            |  57 +++
 .../msmodelslim/schemes/__init__.py           |   0
 .../msmodelslim/schemes/msmodelslim_scheme.py |   0
 .../schemes/msmodelslim_w8a8_int8.py          |   0
 .../schemes/msmodelslim_w8a8_int8_moe.py      |   0
 7 files changed, 483 insertions(+), 241 deletions(-)
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py b/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py
index aae78683686c..e69de29bb2d1 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py
@@ -1,241 +0,0 @@
-from __future__ import annotations
-
-from types import MappingProxyType
-from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
-
-import torch
-from compressed_tensors.quantization import QuantizationStrategy
-
-from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
-    NPUW4A8Int4DynamicMoEMethod,
-    NPUW4A16Int4DynamicMoEMethod,
-    NPUW8A8Int8DynamicMoEMethod,
-)
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
-    NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod,
-)
-from sglang.srt.layers.quantization.base_config import (
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
-from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
-    CompressedTensorsConfig,
-)
-from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
-from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
-from sglang.srt.utils import apply_module_patch
-
-
-# func refers to RMSNorm.__init__
-def npu_wrapper_rmsnorm_init(func):
-    def init(self, hidden_size: int, **extra_args) -> None:
-        func(self, hidden_size, **extra_args)
-        self.ignore_anti = True
-        # The Ascend w8a8_int8 quantization requires adding a bias in rmsnorm
-        self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False)
-
-    return init
-
-
-# func refers to RMSNorm.forward_oot
-def npu_wrapper_rmsnorm_forward(func):
-    def _rmsnorm_forward_oot(
-        self,
-        x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        from sgl_kernel_npu.norm.add_rmsnorm_bias import add_rmsnorm_bias
-
-        if not x.is_contiguous():
-            x = x.contiguous()
-        if residual is not None:
-            out, residual_out = add_rmsnorm_bias(
-                x,
-                residual,
-                self.weight.data,
-                self.bias,
-                self.variance_epsilon,
-            )
-            return out.to(x.dtype), residual_out
-
-        out = torch.ops.npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
-        out = out + self.bias
-        return out.to(x.dtype)
-
-    return _rmsnorm_forward_oot
-
-
-class ModelSlimConfig(QuantizationConfig):
-    """
-    Config class for ModelSlim Quantization, a NPU-specific quantization type.
-    """
-
-    def __init__(self, quant_config: Dict[str, Any] = {}):
-        super().__init__()
-        self.quant_description = quant_config
-        self.is_dynamic = quant_config.get("is_dynamic", False)
-        self.is_moe_w4_dynamic = False
-        ignore = cast(List[str], quant_config.get("ignore", []))
-        self.ignore = ignore if ignore is not None else []
-        packed_modules_mapping = quant_config.get("packed_modules_mapping", {})
-        self.packed_modules_mapping = (
-            packed_modules_mapping if packed_modules_mapping is not None else {}
-        )
-        self.target_scheme_map = (
-            CompressedTensorsConfig._quantization_scheme_map_from_config(
-                config=quant_config
-            )
-        )
-        target = "MoEGMM" if "MoEGMM" in self.target_scheme_map else "Linear"
-        target_scheme = self.target_scheme_map.get(target, None)
-        if target_scheme is None:
-            self.is_moe_w4_dynamic = False
-        else:
-            weight_quant = target_scheme.get("weights")
-            input_quant = target_scheme.get("input_activations")
-            self.is_moe_w4_dynamic = self.is_dynamic_token_w4(weight_quant, input_quant)
-            self.is_moe_input_quant = input_quant
-
-        for name in self.quant_description.keys():
-            if "norm.bias" in name:
-                apply_module_patch(
-                    "sglang.srt.layers.layernorm.RMSNorm",
-                    "__init__",
-                    [npu_wrapper_rmsnorm_init],
-                )
-                apply_module_patch(
-                    "sglang.srt.layers.layernorm.RMSNorm",
-                    "forward_npu",
-                    [npu_wrapper_rmsnorm_forward],
-                )
-
-    @classmethod
-    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
-        return [torch.int8, torch.float16, torch.bfloat16]
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        return 0
-
-    @classmethod
-    def get_name(self) -> str:
-        return "modelslim"
-
-    @classmethod
-    def get_config_filenames(cls) -> List[str]:
-        filenames = ["quant_model_description.json"]
-        return filenames
-
-    @classmethod
-    def from_config(cls, config: Dict[str, Any]) -> ModelSlimConfig:
-        return cls(config)
-
-    def get_quant_method(
-        self,
-        layer: torch.nn.Module,
-        prefix: str,
-    ) -> Optional[QuantizeMethodBase]:
-        from sglang.srt.layers.linear import LinearBase
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
-
-        if isinstance(layer, LinearBase):
-            if should_ignore_layer(
-                prefix,
-                ignore=self.ignore,
-                fused_mapping=self.packed_modules_mapping,
-            ):
-                return UnquantizedLinearMethod()
-            key = "model"
-            if "vision_model" in prefix:
-                key = "vision_model"
-            elif "visual" in prefix:
-                key = "visual"
-            packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
-            prefix_in_quant_config = prefix
-            proj_name = prefix.split(".")[-1]
-            if proj_name in packed_modules_mapping_subset:
-                prefix_in_quant_config = prefix.replace(
-                    proj_name, packed_modules_mapping_subset[proj_name][0]
-                )
-            self.is_dynamic = (
-                self.quant_description[prefix_in_quant_config + ".weight"]
-                == "W8A8_DYNAMIC"
-            )
-            if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
-                return UnquantizedLinearMethod()
-            return (
-                NPUW8A8Int8DynamicLinearMethod(self)
-                if self.is_dynamic
-                else NPUW8A8Int8LinearMethod(self)
-            )
-        elif isinstance(layer, FusedMoE):
-            prefix_in_quant_config = prefix + ".0.down_proj.weight"
-            is_moe_w4a8_dynamic = (
-                self.quant_description.get(prefix_in_quant_config, "STATIC")
-                == "W4A8_DYNAMIC"
-            )
-            if (
-                self.is_moe_w4_dynamic and self.is_moe_input_quant is not None
-            ) or is_moe_w4a8_dynamic:
-                return NPUW4A8Int4DynamicMoEMethod()
-            elif self.is_moe_w4_dynamic and self.is_moe_input_quant is None:
-                return NPUW4A16Int4DynamicMoEMethod(self)
-            else:
-                return NPUW8A8Int8DynamicMoEMethod()
-        return None
-
-    def is_layer_skipped(
-        self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
-    ):
-        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
-        proj_name = prefix.split(".")[-1]
-        if proj_name in fused_mapping:
-            shard_prefixes = [
-                prefix.replace(proj_name, shard_proj_name)
-                for shard_proj_name in fused_mapping[proj_name]
-            ]
-
-            is_skipped = None
-            for shard_prefix in shard_prefixes:
-                is_shard_skipped = (
-                    self.quant_description[shard_prefix + ".weight"] == "FLOAT"
-                )
-
-                if is_skipped is None:
-                    is_skipped = is_shard_skipped
-                elif is_shard_skipped != is_skipped:
-                    raise ValueError(
-                        f"Detected some but not all shards of {prefix} "
-                        "are quantized. All shards of fused layers "
-                        "to have the same precision."
-                    )
-        else:
-            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
-
-        assert is_skipped is not None
-        return is_skipped
-
-    def get_scaled_act_names(self) -> List[str]:
-        return []
-
-    def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool:
-        is_w4 = weight_quant.num_bits == 4
-        weight_strategy = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR.value
-            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-            or weight_quant.strategy == QuantizationStrategy.GROUP.value
-        )
-        if input_quant is not None:
-            is_token = (
-                weight_strategy
-                and input_quant.strategy == QuantizationStrategy.TOKEN.value
-            )
-            is_dynamic = not weight_quant.dynamic and input_quant.dynamic
-        else:
-            is_token = weight_strategy
-            is_dynamic = not weight_quant.dynamic
-
-        # Both symmetric and asymmetric input quantization supported.
-        # Only symmetric weight quantization supported.
-        return is_w4 and weight_quant.symmetric and is_token and is_dynamic
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
new file mode 100644
index 000000000000..0f302d3565ae
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -0,0 +1,426 @@
+from __future__ import annotations
+
+import logging
+from types import MappingProxyType
+from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+from pydantic import BaseModel
+
+# from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+#     NPUW4A8Int4DynamicMoEMethod,
+#     NPUW4A16Int4DynamicMoEMethod,
+#     NPUW8A8Int8DynamicMoEMethod,
+# )
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    _NPULinearMethodBase
+    # NPUW8A8Int8DynamicLinearMethod,
+    # NPUW8A8Int8LinearMethod,
+)
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import (
+    ModelSlimMoEMethod,
+)
+from sglang.srt.layers.quantization.compressed_tensors.utils import (
+    find_matched_target,
+    is_activation_quantization_format,
+    should_ignore_layer
+)
+#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import apply_module_patch
+
+logger = logging.getLogger(__name__)
+
+# func refers to RMSNorm.__init__
+def npu_wrapper_rmsnorm_init(func):
+    def init(self, hidden_size: int, **extra_args) -> None:
+        func(self, hidden_size, **extra_args)
+        self.ignore_anti = True
+        # The Ascend w8a8_int8 quantization requires adding a bias in rmsnorm
+        self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False)
+
+    return init
+
+# func refers to RMSNorm.forward_oot
+def npu_wrapper_rmsnorm_forward(func):
+    def _rmsnorm_forward_oot(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        from sgl_kernel_npu.norm.add_rmsnorm_bias import add_rmsnorm_bias
+
+        if not x.is_contiguous():
+            x = x.contiguous()
+        if residual is not None:
+            out, residual_out = add_rmsnorm_bias(
+                x,
+                residual,
+                self.weight.data,
+                self.bias,
+                self.variance_epsilon,
+            )
+            return out.to(x.dtype), residual_out
+
+        out = torch.ops.npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+        out = out + self.bias
+        return out.to(x.dtype)
+
+    return _rmsnorm_forward_oot
+
+
+class ModelSlimConfig(QuantizationConfig):
+    """
+    Config class for ModelSlim Quantization, a NPU-specific quantization type.
+    """
+
+    def __init__(self, quant_config: Dict[str, Any] = {}):
+        super().__init__()
+        self.quant_description = quant_config
+        # self.is_dynamic = quant_config.get("is_dynamic", False)
+        # self.is_moe_w4_dynamic = False
+        ignore = cast(List[str], quant_config.get("ignore", []))
+        self.ignore = ignore if ignore is not None else []
+        packed_modules_mapping = quant_config.get("packed_modules_mapping", {})
+        self.packed_modules_mapping = (
+            packed_modules_mapping if packed_modules_mapping is not None else {}
+        )
+        # self.target_scheme_map = (
+        #     CompressedTensorsConfig._quantization_scheme_map_from_config(
+        #         config=quant_config
+        #     )
+        # )
+        # target = "MoEGMM" if "MoEGMM" in self.target_scheme_map else "Linear"
+        # target_scheme = self.target_scheme_map.get(target, None)
+        # if target_scheme is None:
+        #     self.is_moe_w4_dynamic = False
+        # else:
+        #     weight_quant = target_scheme.get("weights")
+        #     input_quant = target_scheme.get("input_activations")
+        #     self.is_moe_w4_dynamic = self.is_dynamic_token_w4(weight_quant, input_quant)
+        #     self.is_moe_input_quant = input_quant
+
+        for name in self.quant_description.keys():
+            if "norm.bias" in name:
+                apply_module_patch(
+                    "sglang.srt.layers.layernorm.RMSNorm",
+                    "__init__",
+                    [npu_wrapper_rmsnorm_init],
+                )
+                apply_module_patch(
+                    "sglang.srt.layers.layernorm.RMSNorm",
+                    "forward_npu",
+                    [npu_wrapper_rmsnorm_forward],
+                )
+    
+    def get_linear_method(self) -> ModelSlimLinearMethod:
+        return ModelSlimLinearMethod(self)
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.int8, torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 0
+
+    @classmethod
+    def get_name(self) -> str:
+        return "modelslim"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        filenames = ["quant_model_description.json"]
+        return filenames
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> ModelSlimConfig:
+        return cls(config)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if should_ignore_layer(
+                prefix,
+                ignore=self.ignore,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedLinearMethod()
+            key = "model"
+            if "vision_model" in prefix:
+                key = "vision_model"
+            elif "visual" in prefix:
+                key = "visual"
+            packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
+            prefix_in_quant_config = prefix
+            proj_name = prefix.split(".")[-1]
+            if proj_name in packed_modules_mapping_subset:
+                prefix_in_quant_config = prefix.replace(
+                    proj_name, packed_modules_mapping_subset[proj_name][0]
+                )
+            # self.is_dynamic = (
+            #     self.quant_description[prefix_in_quant_config + ".weight"]
+            #     == "W8A8_DYNAMIC"
+            # )
+
+            if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
+                return UnquantizedLinearMethod()
+            scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config)
+            if scheme is None:
+                return UnquantizedLinearMethod()
+            layer.scheme = scheme
+            return (
+                ModelSlimLinearMethod(self)
+            )
+        elif isinstance(layer, FusedMoE):
+            return ModelSlimMoeMethod.get_moe_method(self, layer, prefix)
+        return None
+
+    def _get_scheme_from_parts(
+            self, weight_quant: BaseModel, input_quant: BaseModel
+        ) -> ModelSlimScheme:
+
+            # Detect If Mixed Precision
+            # if self._is_wNa16_group_channel(weight_quant, input_quant):
+            #     if (
+            #         self.quant_format == CompressionFormat.pack_quantized.value
+            #         and weight_quant.num_bits in WNA16_SUPPORTED_BITS
+            #     ):
+            #         return CompressedTensorsWNA16(
+            #             num_bits=weight_quant.num_bits,
+            #             strategy=weight_quant.strategy,
+            #             group_size=weight_quant.group_size,
+            #             actorder=weight_quant.actorder,
+            #         )
+            #     else:
+            #         raise ImportError(
+            #             "Other method (CompressedTensorsW4A16Sparse24) is not supported now"
+            #         )
+
+            if is_activation_quantization_format(self.quant_format):
+                # if self._is_fp8_w8a8(weight_quant, input_quant):
+                #     is_fp8_w8a8_supported = self._check_scheme_supported(
+                #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
+                #     )
+                #     if is_fp8_w8a8_supported:
+                #         return CompressedTensorsW8A8Fp8(
+                #             strategy=weight_quant.strategy,
+                #             is_static_input_scheme=(
+                #                 input_quant and not input_quant.dynamic
+                #             ),
+                #         )
+                #     else:
+                #         # note: input_quant will be present for converted models;
+                #         # will be ignored during inference post loading
+                #         return CompressedTensorsW8A16Fp8(
+                #             strategy=weight_quant.strategy,
+                #             is_static_input_scheme=not input_quant.dynamic,
+                #         )
+
+                # # note: input_quant can be None
+                # if self._is_fp8_w8a16(weight_quant, input_quant):
+                #     is_static_input_scheme = input_quant and not input_quant.dynamic
+                #     return CompressedTensorsW8A16Fp8(
+                #         strategy=weight_quant.strategy,
+                #         is_static_input_scheme=is_static_input_scheme,
+                #     )
+
+                if self._is_static_tensor_w8a8(weight_quant, input_quant):
+                    return ModelSlimW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=True,
+                        input_symmetric=input_quant.symmetric,
+                    )
+
+                if self._is_dynamic_token_w8a8(weight_quant, input_quant):
+                    return ModelSlimW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=False,
+                        input_symmetric=input_quant.symmetric,
+                    )
+
+            raise NotImplementedError("No msmodelslim compatible scheme was found.")
+    
+    def get_scheme(
+            self, layer: torch.nn.Module, layer_name: Optional[str] = None
+        ) -> Optional[ModelSlimScheme]:
+            """
+            get_scheme method adjusted for modelslim, taken from
+            python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+            """
+            if self.target_scheme_map:
+                matched_target = find_matched_target(
+                    layer_name=layer_name,
+                    module=layer,
+                    targets=self.target_scheme_map.keys(),
+                    fused_mapping=self.packed_modules_mapping,
+                )
+
+                scheme_dict = self.target_scheme_map[matched_target]
+                weight_quant = scheme_dict.get("weights")
+                input_quant = scheme_dict.get("input_activations")
+            else:
+                # Find the quant_scheme
+                scheme = self._get_scheme_from_parts(  # type: ignore
+                    weight_quant=weight_quant,
+                    input_quant=input_quant,
+                )
+
+            # Ascend doesn't support device capability
+            # self._check_scheme_supported(scheme.get_min_capability())
+            logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
+            return scheme
+
+    def is_layer_skipped(
+        self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+    ):
+        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
+        proj_name = prefix.split(".")[-1]
+        if proj_name in fused_mapping:
+            shard_prefixes = [
+                prefix.replace(proj_name, shard_proj_name)
+                for shard_proj_name in fused_mapping[proj_name]
+            ]
+
+            is_skipped = None
+            for shard_prefix in shard_prefixes:
+                is_shard_skipped = (
+                    self.quant_description[shard_prefix + ".weight"] == "FLOAT"
+                )
+
+                if is_skipped is None:
+                    is_skipped = is_shard_skipped
+                elif is_shard_skipped != is_skipped:
+                    raise ValueError(
+                        f"Detected some but not all shards of {prefix} "
+                        "are quantized. All shards of fused layers "
+                        "to have the same precision."
+                    )
+        else:
+            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
+
+        assert is_skipped is not None
+        return is_skipped
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool:
+        is_w4 = weight_quant.num_bits == 4
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value
+        )
+        if input_quant is not None:
+            is_token = (
+                weight_strategy
+                and input_quant.strategy == QuantizationStrategy.TOKEN.value
+            )
+            is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+        else:
+            is_token = weight_strategy
+            is_dynamic = not weight_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_w4 and weight_quant.symmetric and is_token and is_dynamic
+
+    def _is_static_tensor_w8a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_tensor = (
+            weight_strategy
+            and input_quant.strategy == QuantizationStrategy.TENSOR.value
+        )
+        is_static = not weight_quant.dynamic and not input_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_tensor and weight_quant.symmetric and is_static
+
+    def _is_dynamic_token_w8a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
+
+
+class ModelSlimLinearMethod(_NPULinearMethodBase):
+
+    def __init__(self, quantization_config: ModelSlimConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the ModelSlimScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use the output of create_weights and the CompressedTensorsScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
new file mode 100644
index 000000000000..bee981b3d3b1
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -0,0 +1,57 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/v0.8.2/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import enum
+import logging
+from enum import Enum
+from typing import TYPE_CHECKING
+
+import torch
+from compressed_tensors import CompressionFormat
+from compressed_tensors.quantization import QuantizationStrategy
+
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+
+logger = logging.getLogger(__name__)
+
+
+__all__ = [
+    "ModelSlimMoEMethod",
+]
+
+
+class ModelSlimMoEMethod(FusedMoEMethodBase):
+    def __new__(cls, *args, **kwargs):
+        if cls is ModelSlimMoEMethod:
+            return super().__new__(cls)
+        return super().__new__(cls)
+
+    @staticmethod
+    def get_moe_method(
+        quant_config: ModelSlimConfig,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> "ModelSlimMoEMethod":
+        # TODO: @dsikka: refactor this to use schemes as other kernels
+        # are supported + check if the layer is being ignored.
+
+        weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
+        input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
+        is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant)
+        is_moe_input_quant = input_quant
+
+        if (
+            is_moe_w4_dynamic and is_moe_input_quant is not None
+        ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant):
+            return NPUW4A8Int4DynamicMoEMethod(quant_config)
+        elif is_moe_w4_dynamic and is_moe_input_quant is None:
+            return NPUW4A16Int4DynamicMoEMethod(quant_config)
+        else:
+            return NPUW8A8Int8DynamicMoEMethod(quant_config)
+        # else:
+        #     raise RuntimeError(
+        #         f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
+        #     )
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py
new file mode 100644
index 000000000000..e69de29bb2d1

From 66c7517f0288e10f6322566e3cb7d163e09d32d1 Mon Sep 17 00:00:00 2001
From: TamirBaydasov <mr.jeijy@gmail.com>
Date: Wed, 10 Dec 2025 15:57:26 +0300
Subject: [PATCH 013/175] Working msmodelslim structure, W8A8, W8A8 MoE, W4A4

---
 python/sglang/srt/configs/model_config.py     |   2 +-
 .../npu/quantization/fused_moe_method_npu.py  | 113 +----------
 .../npu/quantization/linear_method_npu.py     | 159 +++------------
 .../hardware_backend/npu/quantization/w4a4.py |  42 ----
 .../srt/layers/quantization/__init__.py       |   2 +-
 .../quantization/msmodelslim/msmodelslim.py   | 182 +++++++++--------
 .../msmodelslim/msmodelslim_moe.py            | 191 ++++++++++++++++--
 .../msmodelslim/schemes/__init__.py           |  11 +
 .../msmodelslim/schemes/msmodelslim_scheme.py |  56 +++++
 .../schemes/msmodelslim_w4a4_int4.py          |  95 +++++++++
 .../schemes/msmodelslim_w8a8_int8.py          | 140 +++++++++++++
 .../schemes/msmodelslim_w8a8_int8_moe.py      |   0
 12 files changed, 597 insertions(+), 396 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
 delete mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index e4af64b1a116..100ebf48c7ca 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -26,7 +26,7 @@
 from sglang.srt.environ import envs
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_hip, retry
+from sglang.srt.utils import is_hip, retry, is_npu
 from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 938314b0f425..41991a5e6a4a 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -141,115 +141,8 @@ def npu_fused_moe_without_routing_weights_bf16(
 
 
 class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase):
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ) -> None:
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
-
-        self.num_experts = num_experts
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
-        )
-
-        # weight
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size,
-                dtype=torch.int8,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                dtype=torch.int8,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-        # scale
-        w13_weight_scale = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
-        w2_weight_scale = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-        # offset
-        w13_weight_offset = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_offset", w13_weight_offset)
-        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
-        w2_weight_offset = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_offset", w2_weight_offset)
-        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
-
-    def release_weight_cache(self, weight: torch.Tensor):
-        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
-        origin_weight = weight.data.transpose(1, 2)
-        new_weight = origin_weight.contiguous()
-        origin_weight.untyped_storage().resize_(0)
-        return new_weight
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        weight_data = self.release_weight_cache(layer.w13_weight.data)
-        layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
-        weight_data = self.release_weight_cache(layer.w2_weight.data)
-        layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
-        layer.w13_weight_scale = torch.nn.Parameter(
-            layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
-            requires_grad=False,
-        )
-        layer.w2_weight_scale = torch.nn.Parameter(
-            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-        layer.w13_weight_offset = torch.nn.Parameter(
-            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-        layer.w2_weight_offset = torch.nn.Parameter(
-            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
-
-    def create_moe_runner(
-        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
-    ):
-        self.moe_runner_config = moe_runner_config
-
+    @staticmethod
     def apply(
-        self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
@@ -272,9 +165,9 @@ def apply(
             top_k=topk_ids.shape[1],
         )
         return StandardCombineInput(hidden_states=output)
-
+    
+    @staticmethod
     def apply_without_routing_weights(
-        self,
         layer,
         hidden_states,
         hidden_states_scale,
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 46db893b3495..7d61255e17e6 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -25,80 +25,9 @@ def __init__(
 
 
 class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        weight_loader = extra_weight_attrs.get("weight_loader")
-        output_size_per_partition = sum(output_partition_sizes)
-
-        weight = ModelWeightParameter(
-            data=torch.empty(
-                (output_size_per_partition, input_size_per_partition), dtype=torch.int8
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
-        weight_scale = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_scale", weight_scale)
-
-        weight_offset = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_offset", weight_offset)
-
-        input_scale = PerTensorScaleParameter(
-            data=torch.empty(1, dtype=params_dtype),
-            weight_loader=weight_loader,
-        )
-        input_scale.ignore_warning = True
-        layer.register_parameter("input_scale", input_scale)
-
-        input_offset = PerTensorScaleParameter(
-            data=torch.empty(1, dtype=params_dtype),
-            weight_loader=weight_loader,
-        )
-        input_offset.ignore_warning = True
-        layer.register_parameter("input_offset", input_offset)
-
-        quant_bias = ChannelQuantScaleParameter(
-            data=torch.empty(output_size_per_partition, dtype=torch.int32),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("quant_bias", quant_bias)
-
-        if params_dtype == torch.bfloat16:
-            deq_scale_dtype = torch.float32
-        elif params_dtype == torch.float16:
-            deq_scale_dtype = torch.int64
-        else:
-            raise ValueError(f"Unsupported params_dtype: {params_dtype}")
-        deq_scale = ChannelQuantScaleParameter(
-            data=torch.empty(output_size_per_partition, dtype=deq_scale_dtype),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("deq_scale", deq_scale)
-
+    
+    @staticmethod
     def apply(
-        self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
@@ -129,75 +58,40 @@ def apply(
             output_dtype=original_dtype,
         )
 
-    def process_weights_after_loading(self, layer: torch.nn.Module):
-        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight.data = npu_format_cast(layer.weight.data)
-
-        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
-        layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
-
-        expanding_factor = layer.weight.data.shape[0]
-        layer.aclnn_input_scale = torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-        layer.aclnn_input_offset = torch.nn.Parameter(
-            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-
 
 class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase):
 
-    def create_weights(
-        self,
+    @staticmethod
+    def apply(
         layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        weight_loader = extra_weight_attrs.get("weight_loader")
-        output_size_per_partition = sum(output_partition_sizes)
-
-        weight = ModelWeightParameter(
-            data=torch.empty(
-                (output_size_per_partition, input_size_per_partition), dtype=torch.int8
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        original_dtype = x.dtype
+        quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x)
+        return torch.ops.npu.npu_quant_matmul(
+            quant_out,
+            layer.weight,
+            layer.weight_scale,
+            pertoken_scale=dynamic_scale,
+            bias=bias,
+            output_dtype=original_dtype,
         )
-        layer.register_parameter("weight", weight)
 
-        weight_scale = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_scale", weight_scale)
 
-        weight_offset = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_offset", weight_offset)
+class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase):
 
+    @staticmethod
     def apply(
-        self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
+        tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
         original_dtype = x.dtype
-        quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x)
+        quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(
+            x, dst_type=torch.quint4x2
+        )
         return torch.ops.npu.npu_quant_matmul(
             quant_out,
             layer.weight,
@@ -205,11 +99,4 @@ def apply(
             pertoken_scale=dynamic_scale,
             bias=bias,
             output_dtype=original_dtype,
-        )
-
-    def process_weights_after_loading(self, layer: torch.nn.Module):
-        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight.data = npu_format_cast(layer.weight.data)
-
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        )
\ No newline at end of file
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
deleted file mode 100644
index 4676b4655872..000000000000
--- a/python/sglang/srt/hardware_backend/npu/quantization/w4a4.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from typing import TYPE_CHECKING, List, Optional
-
-import torch
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
-from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase
-
-class NPU_W4A4DynamicLinearMethodImpl:
-    """Linear method for NPU W4A4_DYNAMIC."""
-
-    def __init__(self):
-        self.transpose_weight = True
-
-    @staticmethod
-    def apply(
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-        tp_rank: Optional[int] = 0,
-    ) -> torch.Tensor:
-        original_dtype = x.dtype
-        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(
-            x, dst_type=torch.quint4x2
-        )
-        return torch_npu.npu_quant_matmul(
-            quant_out,
-            layer.weight,
-            layer.weight_scale,
-            pertoken_scale=dynamic_scale,
-            bias=bias,
-            output_dtype=original_dtype,
-        )
-
-    def process_weights_after_loading(self, layer):
-        if self.transpose_weight:
-            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
-        layer.weight.data = torch_npu.npu_convert_weight_to_int4pack(
-            layer.weight.data.to(torch.int32)
-        )
diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py
index f4ec7d8c46a4..4aa1843a4d85 100644
--- a/python/sglang/srt/layers/quantization/__init__.py
+++ b/python/sglang/srt/layers/quantization/__init__.py
@@ -79,7 +79,7 @@ def override_quantization_method(self, *args, **kwargs):
     )
 
 if is_npu():
-    from sglang.srt.hardware_backend.npu.quantization.modelslim import ModelSlimConfig
+    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig
 
     BASE_QUANTIZATION_METHODS.update(
         {
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 0f302d3565ae..b28893f0f42e 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -25,6 +25,11 @@
 from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import (
     ModelSlimMoEMethod,
 )
+from sglang.srt.layers.quantization.msmodelslim.schemes import (
+    ModelSlimScheme,
+    ModelSlimW8A8Int8,
+    ModelSlimW4A4Int4,
+)
 from sglang.srt.layers.quantization.compressed_tensors.utils import (
     find_matched_target,
     is_activation_quantization_format,
@@ -130,7 +135,7 @@ def get_min_capability(cls) -> int:
         return 0
 
     @classmethod
-    def get_name(self) -> str:
+    def get_name(cls) -> str:
         return "modelslim"
 
     @classmethod
@@ -188,9 +193,21 @@ def get_quant_method(
         return None
 
     def _get_scheme_from_parts(
-            self, weight_quant: BaseModel, input_quant: BaseModel
+            self, layer_name: str,
         ) -> ModelSlimScheme:
 
+        quant_type = self.quant_description[layer_name + '.weight']
+        if quant_type == "W8A8_DYNAMIC":
+            return ModelSlimW8A8Int8(
+                quant_config=self.quant_description,
+                prefix=layer_name
+            )
+        elif quant_type == "W4A4_DYNAMIC":
+            return ModelSlimW4A4Int4(
+                quant_config=self.quant_description,
+                prefix=layer_name
+            )
+
             # Detect If Mixed Precision
             # if self._is_wNa16_group_channel(weight_quant, input_quant):
             #     if (
@@ -208,7 +225,7 @@ def _get_scheme_from_parts(
             #             "Other method (CompressedTensorsW4A16Sparse24) is not supported now"
             #         )
 
-            if is_activation_quantization_format(self.quant_format):
+            #if is_activation_quantization_format(self.quant_format):
                 # if self._is_fp8_w8a8(weight_quant, input_quant):
                 #     is_fp8_w8a8_supported = self._check_scheme_supported(
                 #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
@@ -236,21 +253,7 @@ def _get_scheme_from_parts(
                 #         is_static_input_scheme=is_static_input_scheme,
                 #     )
 
-                if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                    return ModelSlimW8A8Int8(
-                        strategy=weight_quant.strategy,
-                        is_static_input_scheme=True,
-                        input_symmetric=input_quant.symmetric,
-                    )
-
-                if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                    return ModelSlimW8A8Int8(
-                        strategy=weight_quant.strategy,
-                        is_static_input_scheme=False,
-                        input_symmetric=input_quant.symmetric,
-                    )
-
-            raise NotImplementedError("No msmodelslim compatible scheme was found.")
+            #raise NotImplementedError("No msmodelslim compatible scheme was found.")
     
     def get_scheme(
             self, layer: torch.nn.Module, layer_name: Optional[str] = None
@@ -259,23 +262,24 @@ def get_scheme(
             get_scheme method adjusted for modelslim, taken from
             python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
             """
-            if self.target_scheme_map:
-                matched_target = find_matched_target(
-                    layer_name=layer_name,
-                    module=layer,
-                    targets=self.target_scheme_map.keys(),
-                    fused_mapping=self.packed_modules_mapping,
-                )
-
-                scheme_dict = self.target_scheme_map[matched_target]
-                weight_quant = scheme_dict.get("weights")
-                input_quant = scheme_dict.get("input_activations")
-            else:
+            # if self.target_scheme_map:
+            #     matched_target = find_matched_target(
+            #         layer_name=layer_name,
+            #         module=layer,
+            #         targets=self.target_scheme_map.keys(),
+            #         fused_mapping=self.packed_modules_mapping,
+            #     )
+
+            #     scheme_dict = self.target_scheme_map[matched_target]
+            #     weight_quant = scheme_dict.get("weights")
+            #     input_quant = scheme_dict.get("input_activations")
+            # else:
                 # Find the quant_scheme
-                scheme = self._get_scheme_from_parts(  # type: ignore
-                    weight_quant=weight_quant,
-                    input_quant=input_quant,
-                )
+            scheme = self._get_scheme_from_parts(  # type: ignore
+                # weight_quant=weight_quant,
+                # input_quant=input_quant,
+                layer_name=layer_name,
+            )
 
             # Ascend doesn't support device capability
             # self._check_scheme_supported(scheme.get_min_capability())
@@ -316,61 +320,61 @@ def is_layer_skipped(
     def get_scaled_act_names(self) -> List[str]:
         return []
 
-    def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool:
-        is_w4 = weight_quant.num_bits == 4
-        weight_strategy = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR.value
-            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-            or weight_quant.strategy == QuantizationStrategy.GROUP.value
-        )
-        if input_quant is not None:
-            is_token = (
-                weight_strategy
-                and input_quant.strategy == QuantizationStrategy.TOKEN.value
-            )
-            is_dynamic = not weight_quant.dynamic and input_quant.dynamic
-        else:
-            is_token = weight_strategy
-            is_dynamic = not weight_quant.dynamic
-
-        # Both symmetric and asymmetric input quantization supported.
-        # Only symmetric weight quantization supported.
-        return is_w4 and weight_quant.symmetric and is_token and is_dynamic
-
-    def _is_static_tensor_w8a8(
-        self, weight_quant: BaseModel, input_quant: BaseModel
-    ) -> bool:
-        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        weight_strategy = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR.value
-            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-        )
-        is_tensor = (
-            weight_strategy
-            and input_quant.strategy == QuantizationStrategy.TENSOR.value
-        )
-        is_static = not weight_quant.dynamic and not input_quant.dynamic
-
-        # Both symmetric and asymmetric input quantization supported.
-        # Only symmetric weight quantization supported.
-        return is_8_bits and is_tensor and weight_quant.symmetric and is_static
-
-    def _is_dynamic_token_w8a8(
-        self, weight_quant: BaseModel, input_quant: BaseModel
-    ) -> bool:
-        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        weight_strategy = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR.value
-            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-        )
-        is_token = (
-            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
-        )
-        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
-
-        # Both symmetric and asymmetric input quantization supported.
-        # Only symmetric weight quantization supported.
-        return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
+    # def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool:
+    #     is_w4 = weight_quant.num_bits == 4
+    #     weight_strategy = (
+    #         weight_quant.strategy == QuantizationStrategy.TENSOR.value
+    #         or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+    #         or weight_quant.strategy == QuantizationStrategy.GROUP.value
+    #     )
+    #     if input_quant is not None:
+    #         is_token = (
+    #             weight_strategy
+    #             and input_quant.strategy == QuantizationStrategy.TOKEN.value
+    #         )
+    #         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+    #     else:
+    #         is_token = weight_strategy
+    #         is_dynamic = not weight_quant.dynamic
+
+    #     # Both symmetric and asymmetric input quantization supported.
+    #     # Only symmetric weight quantization supported.
+    #     return is_w4 and weight_quant.symmetric and is_token and is_dynamic
+
+    # def _is_static_tensor_w8a8(
+    #     self, weight_quant: BaseModel, input_quant: BaseModel
+    # ) -> bool:
+    #     is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+    #     weight_strategy = (
+    #         weight_quant.strategy == QuantizationStrategy.TENSOR.value
+    #         or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+    #     )
+    #     is_tensor = (
+    #         weight_strategy
+    #         and input_quant.strategy == QuantizationStrategy.TENSOR.value
+    #     )
+    #     is_static = not weight_quant.dynamic and not input_quant.dynamic
+
+    #     # Both symmetric and asymmetric input quantization supported.
+    #     # Only symmetric weight quantization supported.
+    #     return is_8_bits and is_tensor and weight_quant.symmetric and is_static
+
+    # def _is_dynamic_token_w8a8(
+    #     self, weight_quant: BaseModel, input_quant: BaseModel
+    # ) -> bool:
+    #     is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+    #     weight_strategy = (
+    #         weight_quant.strategy == QuantizationStrategy.TENSOR.value
+    #         or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+    #     )
+    #     is_token = (
+    #         weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+    #     )
+    #     is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+
+    #     # Both symmetric and asymmetric input quantization supported.
+    #     # Only symmetric weight quantization supported.
+    #     return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
 
 
 class ModelSlimLinearMethod(_NPULinearMethodBase):
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index bee981b3d3b1..5dd239a6d1ab 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -5,21 +5,39 @@
 import enum
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import Callable, Optional, TYPE_CHECKING
+from typing import Any, Dict, List
 
 import torch
-from compressed_tensors import CompressionFormat
-from compressed_tensors.quantization import QuantizationStrategy
 
-from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
 from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+from sglang.srt.layers.quantization.msmodelslim.schemes import (
+    ModelSlimScheme,
+)
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    NPUW8A8Int8DynamicMoEMethod,
+)
+
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import (
+        ModelSlimConfig,
+    )
 
 logger = logging.getLogger(__name__)
 
 
 __all__ = [
     "ModelSlimMoEMethod",
+    "ModelSlimW8A8Int8MoE",
 ]
 
 
@@ -38,20 +56,159 @@ def get_moe_method(
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
 
-        weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
-        input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
-        is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant)
-        is_moe_input_quant = input_quant
-
-        if (
-            is_moe_w4_dynamic and is_moe_input_quant is not None
-        ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant):
-            return NPUW4A8Int4DynamicMoEMethod(quant_config)
-        elif is_moe_w4_dynamic and is_moe_input_quant is None:
-            return NPUW4A16Int4DynamicMoEMethod(quant_config)
-        else:
-            return NPUW8A8Int8DynamicMoEMethod(quant_config)
+        return ModelSlimW8A8Int8MoE(quant_config)
+        # weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
+        # input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
+        # is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant)
+        # is_moe_input_quant = input_quant
+
+        # if (
+        #     is_moe_w4_dynamic and is_moe_input_quant is not None
+        # ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant):
+        #     return NPUW4A8Int4DynamicMoEMethod(quant_config)
+        # elif is_moe_w4_dynamic and is_moe_input_quant is None:
+        #     return NPUW4A16Int4DynamicMoEMethod(quant_config)
+        # else:
+        #     return NPUW8A8Int8DynamicMoEMethod(quant_config)
         # else:
         #     raise RuntimeError(
         #         f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
         #     )
+
+
+class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod):
+
+    def __init__(
+            self, quant_config: Dict[str, Any], prefix: str = None,
+    ):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.num_experts = num_experts
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        # weight
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+        # scale
+        w13_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        w2_weight_scale = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+        # offset
+        w13_weight_offset = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_offset", w13_weight_offset)
+        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+        w2_weight_offset = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_offset", w2_weight_offset)
+        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+    def release_weight_cache(self, weight: torch.Tensor):
+        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
+        origin_weight = weight.data.transpose(1, 2)
+        new_weight = origin_weight.contiguous()
+        origin_weight.untyped_storage().resize_(0)
+        return new_weight
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight_data = self.release_weight_cache(layer.w13_weight.data)
+        layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
+
+        weight_data = self.release_weight_cache(layer.w2_weight.data)
+        layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
+
+        layer.w13_weight_scale = torch.nn.Parameter(
+            layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
+            requires_grad=False,
+        )
+        layer.w2_weight_scale = torch.nn.Parameter(
+            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w13_weight_offset = torch.nn.Parameter(
+            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w2_weight_offset = torch.nn.Parameter(
+            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        self.moe_runner_config = moe_runner_config
+    
+    def apply(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+    
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
+                                                                        hidden_states,
+                                                                        hidden_states_scale,
+                                                                        group_list_type,
+                                                                        group_list,
+                                                                        output_dtype,)
\ No newline at end of file
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
index e69de29bb2d1..997892772977 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from .msmodelslim_scheme import ModelSlimScheme
+from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8
+from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4
+
+__all__ = [
+    "ModelSlimScheme",
+    "ModelSlimW8A8Int8",
+    "ModelSlimW4A4Int4",
+]
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
index e69de29bb2d1..7e6669abe412 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
@@ -0,0 +1,56 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import torch
+
+__all__ = ["ModelSlimScheme"]
+
+
+class ModelSlimScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by CompressedTensors.
+    """
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
new file mode 100644
index 000000000000..87404a6269be
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
@@ -0,0 +1,95 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Optional
+
+import torch
+from torch.nn import Parameter
+
+from typing import Any, Dict, List
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPU_W4A4DynamicLinearMethod,
+)
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.msmodelslim.schemes import (
+    ModelSlimScheme,
+)
+
+from sglang.srt.utils import set_weight_attrs
+
+
+class ModelSlimW4A4Int4(ModelSlimScheme):
+
+    def __init__(
+        self, quant_config: Dict[str, any], prefix: str,
+    ):
+        self.quant_config = quant_config
+        self.transpose_weight = True
+        self.is_dynamic = (
+            self.quant_config[prefix + ".weight"]
+            == "W4A4_DYNAMIC"
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        weight_dict = self.quant_method.get_weight(
+            input_size_per_partition, output_size_per_partition, params_dtype
+        )
+        for weight_name, weight_param in weight_dict.items():
+            param = torch.nn.Parameter(weight_param, requires_grad=False)
+            set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
+            layer.register_parameter(weight_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
+        for pertensor_name, pertensor_param in pertensor_dict.items():
+            param = PerTensorScaleParameter(
+                data=pertensor_param, weight_loader=weight_loader
+            )
+            # disable warning
+            param.ignore_warning = True
+            layer.register_parameter(pertensor_name, param)
+
+        perchannel_dict = self.quant_method.get_perchannel_param(
+            output_size_per_partition, params_dtype
+        )
+        for perchannel_name, perchannel_param in perchannel_dict.items():
+            param = torch.nn.Parameter(perchannel_param, requires_grad=False)
+            set_weight_attrs(param, {"output_dim": 0})
+            layer.register_parameter(perchannel_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer):
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
+            layer.weight.data.to(torch.int32)
+        )
+    
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias)
\ No newline at end of file
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index e69de29bb2d1..7963c87200c4 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -0,0 +1,140 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Optional
+
+import torch
+from torch.nn import Parameter
+
+from typing import Any, Dict, List
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPUW8A8Int8DynamicLinearMethod,
+    NPUW8A8Int8LinearMethod
+)
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.msmodelslim.schemes import (
+    ModelSlimScheme,
+)
+
+
+class ModelSlimW8A8Int8(ModelSlimScheme):
+
+    def __init__(
+        self, quant_config: Dict[str, any], prefix: str,
+    ):
+        self.quant_config = quant_config
+        self.is_dynamic = (
+            self.quant_config[prefix + ".weight"]
+            == "W8A8_DYNAMIC"
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        output_size_per_partition = sum(output_partition_sizes)
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                (output_size_per_partition, input_size_per_partition), dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+        weight_offset = ChannelQuantScaleParameter(
+            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_offset", weight_offset)
+        
+        if not self.is_dynamic:
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(1, dtype=params_dtype),
+                weight_loader=weight_loader,
+            )
+            input_scale.ignore_warning = True
+            layer.register_parameter("input_scale", input_scale)
+
+            input_offset = PerTensorScaleParameter(
+                data=torch.empty(1, dtype=params_dtype),
+                weight_loader=weight_loader,
+            )
+            input_offset.ignore_warning = True
+            layer.register_parameter("input_offset", input_offset)
+
+            quant_bias = ChannelQuantScaleParameter(
+                data=torch.empty(output_size_per_partition, dtype=torch.int32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("quant_bias", quant_bias)
+
+            if params_dtype == torch.bfloat16:
+                deq_scale_dtype = torch.float32
+            elif params_dtype == torch.float16:
+                deq_scale_dtype = torch.int64
+            else:
+                raise ValueError(f"Unsupported params_dtype: {params_dtype}")
+            deq_scale = ChannelQuantScaleParameter(
+                data=torch.empty(output_size_per_partition, dtype=deq_scale_dtype),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("deq_scale", deq_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+
+        if not self.is_dynamic:
+            expanding_factor = layer.weight.data.shape[0]
+            layer.aclnn_input_scale = torch.nn.Parameter(
+                layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+                requires_grad=False,
+            )
+            layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+                layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+                requires_grad=False,
+            )
+            layer.aclnn_input_offset = torch.nn.Parameter(
+                layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
+                requires_grad=False,
+            )
+    
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if self.is_dynamic:
+            return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias)
+        else:
+            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
\ No newline at end of file
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8_moe.py
deleted file mode 100644
index e69de29bb2d1..000000000000

From ccfe6f63ba3f74115f1239826d224df1c751d673 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 11 Dec 2025 13:54:24 +0300
Subject: [PATCH 014/175] Delete w4a16_moe.py

---
 .../npu/quantization/w4a16_moe.py             | 195 ------------------
 1 file changed, 195 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py
deleted file mode 100644
index 2f3f2a4539f3..000000000000
--- a/python/sglang/srt/hardware_backend/npu/quantization/w4a16_moe.py
+++ /dev/null
@@ -1,195 +0,0 @@
-from typing import TYPE_CHECKING
-
-import numpy as np
-import torch
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
-from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-
-class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase):
-
-    def __init__(self) -> None:
-        self.group_size = 256
-        self.tp_size = 1
-
-    def pack_to_int32(self, weight: torch.Tensor):
-        assert weight.dim() == 3
-        if weight.dtype == torch.int32:
-            # pack 8 int4 to int32, we use a int32 to represent a int4
-            assert (
-                weight.shape[-1] % 8 == 0
-            ), "the last dim of weight needs to be divided by 8"
-            new_weight = torch.ops.npu.npu_convert_weight_to_int4pack(
-                weight.flatten(0, 1)
-            )
-            new_weight = new_weight.view(weight.shape[0], weight.shape[1], -1)
-        elif weight.dtype == torch.int8:
-            # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
-            assert (
-                weight.shape[-1] % 4 == 0
-            ), "the last dim of weight needs to be divided by 4"
-            new_weight = weight.view(torch.int32).contiguous()
-        else:
-            raise ValueError(f"{weight.dtype=} is not supported !")
-        return new_weight
-
-    def unpack_from_int32(
-        self,
-        value: torch.Tensor,
-        num_bits: int,
-        shape: torch.Size = None,
-        packed_dim=1,
-    ) -> torch.Tensor:
-        """
-        Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
-        original bit range.
-
-        Return tensors in int8
-
-        :param value: tensor to unpack
-        :param num_bits: number of bits to unpack each data point into
-        :param shape: shape to unpack into, used to remove padding
-        :returns: unpacked int8 tensor
-        """
-        if value.dtype is not torch.int32:
-            raise ValueError(
-                f"Expected {torch.int32} but got {value.dtype}, Aborting unpack."
-            )
-
-        if num_bits > 8:
-            raise ValueError("Unpacking is only supported for less than 8 bits")
-
-        pack_factor = 32 // num_bits
-
-        # unpack
-        mask = (1 << num_bits) - 1
-
-        if packed_dim == 1:
-            unpacked = torch.zeros(
-                (value.shape[0], value.shape[1] * pack_factor),
-                device=value.device,
-                dtype=torch.int32,
-            )
-            for i in range(pack_factor):
-                unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
-
-            # remove padding
-            if shape is not None:
-                original_row_size = int(shape[1])
-                unpacked = unpacked[:, :original_row_size]
-        else:
-            unpacked = torch.zeros(
-                (value.shape[0] * pack_factor, value.shape[1]),
-                device=value.device,
-                dtype=torch.int32,
-            )
-            for i in range(pack_factor):
-                unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
-
-            # remove padding
-            original_row_size = int(shape[0])
-            unpacked = unpacked[:original_row_size, :]
-
-        # bits are packed in unsigned format, reformat to signed
-        # update the value range from unsigned to signed
-        offset = pow(2, num_bits) // 2
-        unpacked = (unpacked - offset).to(torch.int8)
-
-        return unpacked
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous()
-        w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous()
-        layer.w13_weight_scale = torch.nn.Parameter(
-            w13_weight_scale, requires_grad=False
-        )
-        layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False)
-
-        layer.w13_weight_offset = torch.nn.Parameter(
-            layer.w13_weight_offset.data.transpose(-1, -2).contiguous(),
-            requires_grad=False,
-        )
-        layer.w2_weight_offset = torch.nn.Parameter(
-            layer.w2_weight_offset.data.transpose(-1, -2).contiguous(),
-            requires_grad=False,
-        )
-
-        # w = [n, k // 8]  --> [k, n // 8]
-        # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous()
-        # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous()
-        unpacked_w13_weight = (
-            self.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4)
-            .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1)
-            .transpose(1, 2)
-            .contiguous()
-            .int()
-        )
-        unpacked_w2_weight = (
-            self.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4)
-            .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1)
-            .transpose(1, 2)
-            .contiguous()
-            .int()
-        )
-
-        w13_weight = self.pack_to_int32(unpacked_w13_weight)
-        w2_weight = self.pack_to_int32(unpacked_w2_weight)
-
-        layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-        layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
-
-    def create_moe_runner(
-        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
-    ):
-        self.moe_runner_config = moe_runner_config
-
-    def apply(
-        self,
-        layer,
-        dispatch_output: "StandardDispatchOutput",
-    ) -> "CombineInput":
-        # FIXME W4A8 only support with deepep
-        raise NotImplementedError(
-            f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep"
-        )
-
-    def apply_without_routing_weights(
-        self,
-        layer,
-        hidden_states,
-        hidden_states_scale,
-        group_list_type,
-        group_list,
-        output_dtype,
-    ):
-        hidden_states = torch.ops.npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[self.w13_weight],
-            scale=[self.w13_weight_scale],
-            bias=[self.w13_scale_bias],
-            per_token_scale=[hidden_states_scale],
-            group_list=group_list,
-            split_item=2,
-            group_type=0,
-            group_list_type=group_list_type,
-            output_dtype=output_dtype,
-        )[0]
-
-        # act_fn: swiglu
-        hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
-        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
-
-        hidden_states = torch.ops.npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[self.w2_weight],
-            scale=[self.w2_weight_scale],
-            bias=[self.w2_scale_bias],
-            per_token_scale=[swiglu_out_scale],
-            group_list=group_list,
-            split_item=2,
-            group_type=0,
-            group_list_type=group_list_type,
-            output_dtype=output_dtype,
-        )[0]
-
-        return hidden_states

From 0a48b2bb1007687394cb5dc6c6728d2b0d37eb55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 11 Dec 2025 13:54:38 +0300
Subject: [PATCH 015/175] Delete w4a8.py

---
 .../hardware_backend/npu/quantization/w4a8.py | 119 ------------------
 1 file changed, 119 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py
deleted file mode 100644
index 7cd4dc81486a..000000000000
--- a/python/sglang/srt/hardware_backend/npu/quantization/w4a8.py
+++ /dev/null
@@ -1,119 +0,0 @@
-from __future__ import annotations
-
-import importlib
-import sys
-from types import MappingProxyType
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    List,
-    Mapping,
-    Optional,
-    Tuple,
-    Union,
-    cast,
-)
-
-import torch
-from torch.nn.parameter import Parameter
-
-from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
-from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
-from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
-from sglang.srt.layers.parameter import (
-    ChannelQuantScaleParameter,
-    ModelWeightParameter,
-    PerTensorScaleParameter,
-)
-from sglang.srt.layers.quantization.base_config import (
-    FusedMoEMethodBase,
-    LinearMethodBase,
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
-from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
-from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
-from sglang.srt.layers.quantization.w8a8_int8 import NPU_W8A8DynamicLinearMethod
-from sglang.srt.utils import (
-    apply_module_patch,
-    cpu_has_amx_support,
-    is_cpu,
-    is_cuda,
-    is_npu,
-    set_weight_attrs,
-    use_intel_amx_backend,
-)
-
-if TYPE_CHECKING:
-    from sglang.srt.layers.moe.token_dispatcher import (
-        CombineInput,
-        StandardDispatchOutput,
-    )
-
-_is_cuda = is_cuda()
-_is_cpu_amx_available = cpu_has_amx_support()
-_is_cpu = is_cpu()
-_is_npu = is_npu()
-
-if _is_npu:
-    import torch_npu
-
-class NPU_W4A8DynamicLinearMethod:
-    """Linear method for NPU W4A8_DYNAMIC."""
-
-    def __init__(self):
-        self.transpose_weight = True
-        try:
-            self.group_size = self.quantization_config.get("group_size", 256)
-        except AttributeError:
-            self.group_size = 256
-        
-    @staticmethod
-    def process_scale_second(weight: torch.Tensor, scale: torch.Tensor,
-                             per_group_scale: torch.Tensor):
-        k, n = weight.shape
-        group_num, n = per_group_scale.shape
-        weight_high = weight.to(torch.float32).reshape(
-            group_num, -1, n) * per_group_scale.reshape(group_num, 1, n)
-        weight_high = weight_high.reshape(k, n)
-        bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0)
-        antiquant_scale = (scale * per_group_scale).reshape(group_num, n)
-        return antiquant_scale.npu(), bias
-    
-    @staticmethod
-    def apply(
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-        tp_rank: Optional[int] = 0,
-    ) -> torch.Tensor:
-        group_size = 256
-        return torch_npu.npu_weight_quant_batchmatmul(
-            x,
-            layer.weight,
-            antiquant_scale=layer.weight_scale_second.to(x.dtype),
-            antiquant_group_size=group_size,
-        )
-
-    def process_weights_after_loading(self, layer):
-        if self.transpose_weight:
-            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-
-        layer.weight_scale.data = layer.weight_scale.data.flatten().to(
-            torch.float32)
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
-        layer.weight_scale_second.data, scale_bias = self.process_scale_second(
-            layer.weight.data,
-            layer.weight_scale.data,
-            layer.weight_scale_second.data.transpose(0, 1).contiguous(),
-        )
-        param = torch.nn.Parameter(scale_bias, requires_grad=False)
-        layer.register_parameter("weight_scale_bias", param)
-        layer.weight.data = torch_npu.npu_convert_weight_to_int4pack(
-            layer.weight.data.to(torch.int32))

From f4fdb0e3e5a24c4c76bdb60d6d6b1092dc136431 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 11 Dec 2025 13:54:58 +0300
Subject: [PATCH 016/175] Delete w4a8_moe.py

---
 .../npu/quantization/w4a8_moe.py              | 148 ------------------
 1 file changed, 148 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py
deleted file mode 100644
index 3696c4d36380..000000000000
--- a/python/sglang/srt/hardware_backend/npu/quantization/w4a8_moe.py
+++ /dev/null
@@ -1,148 +0,0 @@
-from typing import TYPE_CHECKING
-
-import numpy as np
-import torch
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
-from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-
-class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase):
-
-    def __init__(self) -> None:
-        self.group_size = 256
-        self.tp_size = 1
-
-    def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
-        scale = scale.transpose(1, 2).contiguous()
-        per_group_scale = per_group_scale.transpose(1, 2).contiguous()
-        group_num, k, n = weight.shape
-        # the weight of the new version is reduced by half by pack n, so it needs to be restored
-        n = n * 2
-        per_group_scale = per_group_scale.reshape(group_num, -1, n)
-        group_num, quantgroup_num, n = per_group_scale.shape
-        bias = None
-
-        scale_fp32 = (scale * per_group_scale).to(torch.float16).to(torch.float32)
-        scale_fp32_np = scale_fp32.cpu().numpy()
-        scale_fp32_np.dtype = np.uint32
-        sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), dtype=np.uint32)
-
-        sscale_uint64[..., ::2] = scale_fp32_np
-
-        sscale_uint64_buffer = np.frombuffer(
-            sscale_uint64.tobytes(), dtype=np.int64
-        ).copy()
-        sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape(
-            group_num, quantgroup_num, n
-        )
-        sscale_uint64_tensor = sscale_uint64_tensor.npu()
-        return sscale_uint64_tensor, bias
-
-    def update_bias(self, layer, w13_bias, w2_bias):
-        layer.w13_scale_bias.data = (
-            layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
-        )
-        layer.w2_scale_bias.data = (
-            layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
-        )
-
-    def pack_to_int32(self, weight: torch.Tensor):
-        # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
-        assert (
-            weight.shape[-1] % 4 == 0
-        ), "the last dim of weight needs to be divided by 4"
-        return weight.view(torch.int32).contiguous()
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.w13_weight = torch.nn.Parameter(
-            layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
-        )
-        layer.w2_weight = torch.nn.Parameter(
-            layer.w2_weight.data.transpose(1, 2).contiguous(), requires_grad=False
-        )
-
-        w13_weight_scale_second = (
-            layer.w13_weight_scale_second.data
-            if hasattr(layer, "w13_weight_scale_second")
-            else None
-        )
-        w2_weight_scale_second = (
-            layer.w2_weight_scale_second.data
-            if hasattr(layer, "w2_weight_scale_second")
-            else None
-        )
-        layer.w13_weight_scale.data, w13_bias = self.process_scale(
-            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second
-        )
-        layer.w2_weight_scale.data, w2_bias = self.process_scale(
-            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second
-        )
-        if hasattr(layer, "w13_weight_scale_second"):
-            # scale_second is no longer used, release this part of the memory
-            del layer.w13_weight_scale_second
-            del layer.w2_weight_scale_second
-            del layer.w13_weight_offset_second
-            del layer.w2_weight_offset_second
-
-        self.update_bias(layer, w13_bias, w2_bias)
-
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
-        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
-        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
-
-    def create_moe_runner(
-        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
-    ):
-        self.moe_runner_config = moe_runner_config
-
-    def apply(
-        self,
-        layer,
-        dispatch_output: "StandardDispatchOutput",
-    ) -> "CombineInput":
-        # FIXME W4A8 only support with deepep
-        raise NotImplementedError(
-            f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep"
-        )
-
-    def apply_without_routing_weights(
-        self,
-        layer,
-        hidden_states,
-        hidden_states_scale,
-        group_list_type,
-        group_list,
-        output_dtype,
-    ):
-        hidden_states = torch.ops.npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[self.w13_weight],
-            scale=[self.w13_weight_scale],
-            bias=[self.w13_scale_bias],
-            per_token_scale=[hidden_states_scale],
-            group_list=group_list,
-            split_item=2,
-            group_type=0,
-            group_list_type=group_list_type,
-            output_dtype=output_dtype,
-        )[0]
-
-        # act_fn: swiglu
-        hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
-        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
-
-        hidden_states = torch.ops.npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[self.w2_weight],
-            scale=[self.w2_weight_scale],
-            bias=[self.w2_scale_bias],
-            per_token_scale=[swiglu_out_scale],
-            group_list=group_list,
-            split_item=2,
-            group_type=0,
-            group_list_type=group_list_type,
-            output_dtype=output_dtype,
-        )[0]
-
-        return hidden_states

From 1f4f87015537668eb98b83708e5805474b097b8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 11 Dec 2025 13:55:12 +0300
Subject: [PATCH 017/175] Delete w8a8.py

---
 .../hardware_backend/npu/quantization/w8a8.py | 100 ------------------
 1 file changed, 100 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py
deleted file mode 100644
index f9ad7f4a16ac..000000000000
--- a/python/sglang/srt/hardware_backend/npu/quantization/w8a8.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from typing import TYPE_CHECKING, List, Optional
-
-import torch
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
-from sglang.srt.hardware_backend.npu.quantization.utils import _NPULinearMethodBase
-
-class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
-    """Linear method for NPU W8A8."""
-  
-    def __init__(self):
-        self.transpose_weight = True
-
-    @staticmethod
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        from sglang.srt.layers.linear import RowParallelLinear
-
-        original_dtype = x.dtype
-        if original_dtype != torch.int8:
-            x = torch.ops.npu.npu_quantize(
-                x,
-                layer.aclnn_input_scale_reciprocal,
-                layer.aclnn_input_offset,
-                torch.qint8,
-                -1,
-                False,
-            )
-        # Only fuse bias add into GEMM for rank 0 (this ensures that
-        # bias will not get added more than once in Attention TP>1 case)
-        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
-            quant_bias = None
-        else:
-            quant_bias = layer.quant_bias
-        return torch.ops.npu.npu_quant_matmul(
-            x,
-            layer.weight,
-            layer.deq_scale,
-            bias=quant_bias,
-            output_dtype=original_dtype,
-        )
-
-    def process_weights_after_loading(self, layer: torch.nn.Module):
-        if self.transpose_weight:
-            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight.data = npu_format_cast(layer.weight.data)
-
-        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
-        layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
-
-        expanding_factor = layer.weight.data.shape[0]
-        layer.aclnn_input_scale = torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-        layer.aclnn_input_offset = torch.nn.Parameter(
-            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-
-
-class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase):
-    """Linear method for NPU W8A8_DYNAMIC."""
-  
-    def __init__(self):
-        self.transpose_weight = True
-
-    @staticmethod
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        original_dtype = x.dtype
-        quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x)
-        return torch.ops.npu.npu_quant_matmul(
-            quant_out,
-            layer.weight,
-            layer.weight_scale,
-            pertoken_scale=dynamic_scale,
-            bias=bias,
-            output_dtype=original_dtype,
-        )
-
-    def process_weights_after_loading(self, layer: torch.nn.Module):
-        if self.transpose_weight:
-            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight.data = npu_format_cast(layer.weight.data)
-
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_offset.data = layer.weight_offset.data.flatten()

From b5fcf782b0171e953ebbf344541538550716c2c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 11 Dec 2025 13:55:29 +0300
Subject: [PATCH 018/175] Delete w8a8_moe.py

---
 .../npu/quantization/w8a8_moe.py              | 215 ------------------
 1 file changed, 215 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py b/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py
deleted file mode 100644
index 789e5b516ced..000000000000
--- a/python/sglang/srt/hardware_backend/npu/quantization/w8a8_moe.py
+++ /dev/null
@@ -1,215 +0,0 @@
-from typing import TYPE_CHECKING
-
-import numpy as np
-import torch
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
-from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-
-def npu_fused_experts(
-    hidden_states: torch.Tensor,
-    w13: torch.Tensor,
-    w13_scale: torch.Tensor,
-    w2: torch.Tensor,
-    w2_scale: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    top_k: int,
-    **kwargs,
-):
-    w13_offset = kwargs.get("w13_offset", None)
-    w2_offset = kwargs.get("w2_offset", None)
-    use_wna16 = kwargs.get("use_wna16", False)
-
-    original_shape = hidden_states.shape
-    original_dtype = hidden_states.dtype
-    scale_dtype = original_dtype if original_dtype == torch.bfloat16 else torch.float32
-    if len(original_shape) == 3:
-        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
-    num_tokens = hidden_states.shape[0]
-    num_experts = w13.shape[0]
-    row_idx_len = num_tokens * top_k
-    row_idx = (
-        torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device)
-        .view(top_k, -1)
-        .permute(1, 0)
-        .contiguous()
-    )
-    hidden_states, expanded_row_idx, expanded_expert_idx = (
-        torch.ops.npu.npu_moe_init_routing(
-            hidden_states, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens
-        )
-    )
-    expert_tokens = torch.ops.npu.npu_moe_compute_expert_tokens(
-        expanded_expert_idx, num_experts
-    )
-    expert_tokens = expert_tokens.to(torch.int64)
-    # gmm1: gate_up_proj
-    if not use_wna16:
-        hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
-        scale_args13 = {
-            "scale": [w13_scale.to(scale_dtype)],
-            "per_token_scale": [pertoken_scale],
-        }
-    else:
-        scale_args13 = {
-            "antiquant_scale": [w13_scale],
-            "antiquant_offset": [w13_offset],
-        }
-
-    hidden_states = torch.ops.npu.npu_grouped_matmul(
-        x=[hidden_states],
-        weight=[w13],
-        **scale_args13,
-        split_item=2,
-        group_list_type=0,
-        group_type=0,
-        group_list=expert_tokens,
-        output_dtype=original_dtype,
-    )[0]
-    # act_fn: swiglu
-    hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
-    if not use_wna16:
-        hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
-
-        scale_args2 = {
-            "scale": [w2_scale.to(scale_dtype)],
-            "per_token_scale": [pertoken_scale],
-        }
-    else:
-        scale_args2 = {"antiquant_scale": [w2_scale], "antiquant_offset": [w2_offset]}
-    # gmm2: down_proj
-    hidden_states = torch.ops.npu.npu_grouped_matmul(
-        x=[hidden_states],
-        weight=[w2],
-        **scale_args2,
-        split_item=2,
-        group_list_type=0,
-        group_type=0,
-        group_list=expert_tokens,
-        output_dtype=original_dtype,
-    )[0]
-
-    final_hidden_states = torch.ops.npu.npu_moe_finalize_routing(
-        hidden_states,
-        skip1=None,
-        skip2=None,
-        bias=None,
-        scales=topk_weights,
-        expanded_src_to_dst_row=expanded_row_idx,
-        export_for_source_row=topk_ids,
-    )
-    if len(original_shape) == 3:
-        final_hidden_states = final_hidden_states.view(original_shape)
-    return final_hidden_states
-
-class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase):
-
-    ### TODO remove this ###
-    def release_weight_cache(self, weight: torch.Tensor):
-        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
-        origin_weight = weight.data.transpose(1, 2)
-        new_weight = origin_weight.contiguous()
-        origin_weight.untyped_storage().resize_(0)
-        return new_weight
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        weight_data = self.release_weight_cache(layer.w13_weight.data)
-        layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
-        weight_data = self.release_weight_cache(layer.w2_weight.data)
-        layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
-        layer.w13_weight_scale = torch.nn.Parameter(
-            layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
-            requires_grad=False,
-        )
-        layer.w2_weight_scale = torch.nn.Parameter(
-            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-        layer.w13_weight_offset = torch.nn.Parameter(
-            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-        layer.w2_weight_offset = torch.nn.Parameter(
-            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
-
-    def create_moe_runner(
-        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
-    ):
-        self.moe_runner_config = moe_runner_config
-
-    def apply(
-        self,
-        layer,
-        dispatch_output: "StandardDispatchOutput",
-    ) -> "CombineInput":
-        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
-
-        x = dispatch_output.hidden_states
-        topk_output = dispatch_output.topk_output
-
-        topk_weights, topk_ids, _ = topk_output
-        topk_ids = topk_ids.to(torch.int32)
-        topk_weights = topk_weights.to(x.dtype)
-        output = npu_fused_experts(
-            hidden_states=x,
-            w13=layer.w13_weight,
-            w13_scale=layer.w13_weight_scale,
-            w2=layer.w2_weight,
-            w2_scale=layer.w2_weight_scale,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            top_k=topk_ids.shape[1],
-        )
-        return StandardCombineInput(hidden_states=output)
-
-    def apply_without_routing_weights(
-        self,
-        layer,
-        hidden_states,
-        hidden_states_scale,
-        group_list_type,
-        group_list,
-        output_dtype,
-    ):
-        # gmm1: gate_up_proj
-        hidden_states = torch.ops.npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[layer.w13_weight],
-            split_item=2,
-            group_list_type=group_list_type,
-            group_type=0,
-            group_list=group_list,
-            output_dtype=torch.int32,
-        )[0]
-
-        # act_fn: swiglu
-        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dequant_swiglu_quant(
-            x=hidden_states,
-            weight_scale=layer.w13_weight_scale,
-            activation_scale=hidden_states_scale,
-            bias=None,
-            quant_scale=None,
-            quant_offset=None,
-            group_index=group_list,
-            activate_left=True,
-            quant_mode=1,
-        )
-
-        # gmm2: down_proj
-        hidden_states = torch.ops.npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[layer.w2_weight],
-            scale=[layer.w2_weight_scale.to(output_dtype)],
-            per_token_scale=[swiglu_out_scale],
-            split_item=2,
-            group_list_type=group_list_type,
-            group_type=0,
-            group_list=group_list,
-            output_dtype=output_dtype,
-        )[0]
-        return hidden_states

From ba57bc71a47409e436772f2c435503a0cfcbbf30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 11 Dec 2025 13:55:43 +0300
Subject: [PATCH 019/175] Delete utils.py

---
 .../hardware_backend/npu/quantization/utils.py    | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/utils.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/utils.py b/python/sglang/srt/hardware_backend/npu/quantization/utils.py
deleted file mode 100644
index 0350d85e6400..000000000000
--- a/python/sglang/srt/hardware_backend/npu/quantization/utils.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from typing import TYPE_CHECKING, List, Optional
-
-from sglang.srt.layers.quantization.base_config import LinearMethodBase
-
-if TYPE_CHECKING:
-    from sglang.srt.layers.quantization.base_config import QuantizationConfig
-
-class _NPULinearMethodBase(LinearMethodBase):
-
-    def __init__(
-        self,
-        quant_config: Optional["QuantizationConfig"] = None,
-    ):
-        super().__init__()
-        self.quant_config = quant_config

From a5704f1655c51a3438a43171bb39c308e86478a2 Mon Sep 17 00:00:00 2001
From: TamirBaydasov <mr.jeijy@gmail.com>
Date: Thu, 11 Dec 2025 19:35:24 +0300
Subject: [PATCH 020/175] Move process_weights to kernel-side, add npu
 compressed-tensors w8a8int8 support

---
 .../npu/quantization/fused_moe_method_npu.py  |  33 ++++
 .../npu/quantization/linear_method_npu.py     |  40 +++++
 .../compressed_tensors/compressed_tensors.py  |  41 +++--
 .../compressed_tensors/schemes/__init__.py    |   8 +-
 .../schemes/compressed_tensors_w8a8_int8.py   | 155 +++++++++++-------
 .../quantization/msmodelslim/msmodelslim.py   |   2 +-
 .../msmodelslim/msmodelslim_moe.py            |  30 +---
 .../schemes/msmodelslim_w4a4_int4.py          |  10 +-
 .../schemes/msmodelslim_w8a8_int8.py          |  24 +--
 9 files changed, 216 insertions(+), 127 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 877a0c406355..5b5098ed567c 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -141,6 +141,39 @@ def npu_fused_moe_without_routing_weights_bf16(
 
 
 class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase):
+
+    def release_weight_cache(self, weight: torch.Tensor):
+        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
+        origin_weight = weight.data.transpose(1, 2)
+        new_weight = origin_weight.contiguous()
+        origin_weight.untyped_storage().resize_(0)
+        return new_weight
+
+    @classmethod
+    def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
+        weight_data = cls.release_weight_cache(layer.w13_weight.data)
+        layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
+
+        weight_data = cls.release_weight_cache(layer.w2_weight.data)
+        layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
+
+        layer.w13_weight_scale = torch.nn.Parameter(
+            layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
+            requires_grad=False,
+        )
+        layer.w2_weight_scale = torch.nn.Parameter(
+            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w13_weight_offset = torch.nn.Parameter(
+            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w2_weight_offset = torch.nn.Parameter(
+            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
+
     @staticmethod
     def apply(
         layer,
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 7d61255e17e6..6481b4f79bf4 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -58,6 +58,28 @@ def apply(
             output_dtype=original_dtype,
         )
 
+    @staticmethod
+    def process_weights_after_loading(layer: torch.nn.Module):
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+
+        expanding_factor = layer.weight.data.shape[0]
+        layer.aclnn_input_scale = torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_offset = torch.nn.Parameter(
+            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+
 
 class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase):
 
@@ -78,6 +100,14 @@ def apply(
             output_dtype=original_dtype,
         )
 
+    @staticmethod
+    def process_weights_after_loading(layer: torch.nn.Module):
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+
 
 class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase):
 
@@ -99,4 +129,14 @@ def apply(
             pertoken_scale=dynamic_scale,
             bias=bias,
             output_dtype=original_dtype,
+        )
+
+    @staticmethod
+    def process_weights_after_loading(layer):
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
+            layer.weight.data.to(torch.int32)
         )
\ No newline at end of file
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 31f47b88bc2f..c8f8b6ee073d 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -31,7 +31,8 @@
     WNA16_SUPPORTED_BITS,
     CompressedTensorsScheme,
     CompressedTensorsW8A8Fp8,
-    CompressedTensorsW8A8Int8,
+    GPUCompressedTensorsW8A8Int8,
+    NPUCompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16,
 )
@@ -42,6 +43,10 @@
 )
 from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import is_npu
+
+_is_npu = is_npu()
+
 
 logger = logging.getLogger(__name__)
 
@@ -439,18 +444,32 @@ def _get_scheme_from_parts(
                 )
 
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8Int8(
-                    strategy=weight_quant.strategy,
-                    is_static_input_scheme=True,
-                    input_symmetric=input_quant.symmetric,
-                )
+                if _is_npu:
+                    return NPUCompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=True,
+                        input_symmetric=input_quant.symmetric,
+                    )
+                else:
+                    return GPUCompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=True,
+                        input_symmetric=input_quant.symmetric,
+                    )
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8Int8(
-                    strategy=weight_quant.strategy,
-                    is_static_input_scheme=False,
-                    input_symmetric=input_quant.symmetric,
-                )
+                if _is_npu:
+                    return NPUCompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=False,
+                        input_symmetric=input_quant.symmetric,
+                    )
+                else:
+                    return GPUCompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=False,
+                        input_symmetric=input_quant.symmetric,
+                    )
 
         raise NotImplementedError("No compressed-tensors compatible scheme was found.")
 
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
index 6d9871917bbb..e424e5d7b448 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -2,7 +2,10 @@
 
 from .compressed_tensors_scheme import CompressedTensorsScheme
 from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
-from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
+from .compressed_tensors_w8a8_int8 import (
+    GPUCompressedTensorsW8A8Int8,
+    NPUCompressedTensorsW8A8Int8,
+)
 from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
 from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
 
@@ -10,7 +13,8 @@
     "CompressedTensorsScheme",
     "CompressedTensorsW8A8Fp8",
     "CompressedTensorsW8A16Fp8",
-    "CompressedTensorsW8A8Int8",
+    "GPUCompressedTensorsW8A8Int8",
+    "NPUCompressedTensorsW8A8Int8",
     "CompressedTensorsWNA16",
     "WNA16_SUPPORTED_BITS",
 ]
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 9bca2834d646..278584198919 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -15,6 +15,10 @@
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
 )
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPUW8A8Int8DynamicLinearMethod,
+    NPUW8A8Int8LinearMethod
+)
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
 from sglang.srt.layers.quantization.utils import requantize_with_max_scale
 from sglang.srt.utils import is_cuda
@@ -33,6 +37,73 @@ def __init__(
         self.is_static_input_scheme = is_static_input_scheme
         self.input_symmetric = input_symmetric
 
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+        else:
+            assert self.strategy == QuantizationStrategy.TENSOR
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader
+            )
+            layer.register_parameter("input_scale", input_scale)
+
+            if not self.input_symmetric:
+                # Note: compressed-tensors stores the zp using the same dtype
+                # as the weights
+                # AZP loaded as int8 but used as int32
+                input_zero_point = PerTensorScaleParameter(
+                    data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader
+                )
+                layer.register_parameter("input_zero_point", input_zero_point)
+
+
+class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
+    
+    def __init__(
+        self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
+    ):
+        super.__init__(
+            strategy,
+            is_static_input_scheme,
+            input_symmetric
+        )
+
     @classmethod
     def get_min_capability(cls) -> int:
         # lovelace and up
@@ -107,61 +178,6 @@ def process_weights_after_loading(self, layer) -> None:
         else:
             layer.azp_adj = None
 
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        output_partition_sizes: list[int],
-        input_size_per_partition: int,
-        params_dtype: torch.dtype,
-        weight_loader: Callable,
-        **kwargs,
-    ):
-        output_size_per_partition = sum(output_partition_sizes)
-        layer.logical_widths = output_partition_sizes
-
-        # WEIGHT
-        weight = ModelWeightParameter(
-            data=torch.empty(
-                output_size_per_partition, input_size_per_partition, dtype=torch.int8
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-
-        layer.register_parameter("weight", weight)
-
-        # WEIGHT SCALE
-        if self.strategy == QuantizationStrategy.CHANNEL:
-            weight_scale = ChannelQuantScaleParameter(
-                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
-                output_dim=0,
-                weight_loader=weight_loader,
-            )
-        else:
-            assert self.strategy == QuantizationStrategy.TENSOR
-            weight_scale = PerTensorScaleParameter(
-                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
-                weight_loader=weight_loader,
-            )
-        layer.register_parameter("weight_scale", weight_scale)
-
-        # INPUT SCALE
-        if self.is_static_input_scheme:
-            input_scale = PerTensorScaleParameter(
-                data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader
-            )
-            layer.register_parameter("input_scale", input_scale)
-
-            if not self.input_symmetric:
-                # Note: compressed-tensors stores the zp using the same dtype
-                # as the weights
-                # AZP loaded as int8 but used as int32
-                input_zero_point = PerTensorScaleParameter(
-                    data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader
-                )
-                layer.register_parameter("input_zero_point", input_zero_point)
-
     def apply_weights(
         self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
     ) -> torch.Tensor:
@@ -171,3 +187,32 @@ def apply_weights(
         return int8_scaled_mm(
             x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
         )
+
+
+class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
+    
+    def __init__(
+        self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
+    ):
+        super.__init__(
+            strategy,
+            is_static_input_scheme,
+            input_symmetric
+        )
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return NotImplementedError
+
+    def process_weights_after_loading(self, layer):
+        if self.is_static_input_scheme:
+            return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
+        else:
+            return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
+    
+    def apply_weights(self, layer, x, bias):
+        if self.is_static_input_scheme:
+            return NPUW8A8Int8LinearMethod.apply(layer)
+        else:
+            return NPUW8A8Int8DynamicLinearMethod.apply(layer)
+
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index b28893f0f42e..0e250ea8c573 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -189,7 +189,7 @@ def get_quant_method(
                 ModelSlimLinearMethod(self)
             )
         elif isinstance(layer, FusedMoE):
-            return ModelSlimMoeMethod.get_moe_method(self, layer, prefix)
+            return ModelSlimMoEMethod.get_moe_method(self, layer, prefix)
         return None
 
     def _get_scheme_from_parts(
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 5dd239a6d1ab..c12dcf39fd47 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -153,36 +153,8 @@ def create_weights(
         layer.register_parameter("w2_weight_offset", w2_weight_offset)
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
-    def release_weight_cache(self, weight: torch.Tensor):
-        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
-        origin_weight = weight.data.transpose(1, 2)
-        new_weight = origin_weight.contiguous()
-        origin_weight.untyped_storage().resize_(0)
-        return new_weight
-
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        weight_data = self.release_weight_cache(layer.w13_weight.data)
-        layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
-        weight_data = self.release_weight_cache(layer.w2_weight.data)
-        layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
-        layer.w13_weight_scale = torch.nn.Parameter(
-            layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
-            requires_grad=False,
-        )
-        layer.w2_weight_scale = torch.nn.Parameter(
-            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-        layer.w13_weight_offset = torch.nn.Parameter(
-            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-        layer.w2_weight_offset = torch.nn.Parameter(
-            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
+        NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
index 87404a6269be..1d633fcbb06a 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
@@ -30,7 +30,6 @@ def __init__(
         self, quant_config: Dict[str, any], prefix: str,
     ):
         self.quant_config = quant_config
-        self.transpose_weight = True
         self.is_dynamic = (
             self.quant_config[prefix + ".weight"]
             == "W4A4_DYNAMIC"
@@ -77,14 +76,7 @@ def create_weights(
             set_weight_attrs(param, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer):
-        if self.transpose_weight:
-            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
-        layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
-            layer.weight.data.to(torch.int32)
-        )
+        NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer)
     
     def apply_weights(
         self,
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index 7963c87200c4..b33764b858a9 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -107,26 +107,10 @@ def create_weights(
             layer.register_parameter("deq_scale", deq_scale)
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
-        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight.data = npu_format_cast(layer.weight.data)
-
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
-
-        if not self.is_dynamic:
-            expanding_factor = layer.weight.data.shape[0]
-            layer.aclnn_input_scale = torch.nn.Parameter(
-                layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-                requires_grad=False,
-            )
-            layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
-                layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-                requires_grad=False,
-            )
-            layer.aclnn_input_offset = torch.nn.Parameter(
-                layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
-                requires_grad=False,
-            )
+        if self.is_dynamic:
+            NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
+        else:
+            NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
     
     def apply_weights(
         self,

From c42c8f1be2abc9c32c478a36467aab1518674bf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 12 Dec 2025 16:38:38 +0300
Subject: [PATCH 021/175] Added check for empty scheme

---
 .../sglang/srt/layers/quantization/msmodelslim/msmodelslim.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 0e250ea8c573..383ca74c0f02 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -183,7 +183,7 @@ def get_quant_method(
                 return UnquantizedLinearMethod()
             scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config)
             if scheme is None:
-                return UnquantizedLinearMethod()
+                raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.")
             layer.scheme = scheme
             return (
                 ModelSlimLinearMethod(self)

From 25d0d09c1fba729f2c06080f07626b5269e422b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 12 Dec 2025 16:50:32 +0300
Subject: [PATCH 022/175] Remove unnecessary method

---
 .../msmodelslim/schemes/msmodelslim_scheme.py             | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
index 7e6669abe412..1d09c384ca9e 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
@@ -15,14 +15,6 @@ class ModelSlimScheme(ABC):
     of different quantization schemes supported by CompressedTensors.
     """
 
-    @classmethod
-    @abstractmethod
-    def get_min_capability(cls) -> int:
-        """
-        Get minimum device capability.
-        """
-        raise NotImplementedError
-
     @abstractmethod
     def create_weights(self, *args, **kwargs):
         """

From ca4895ed635eaff99e23ceaf37274e564908d561 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 12 Dec 2025 17:47:24 +0300
Subject: [PATCH 023/175] Add w4a8 support

---
 .../msmodelslim/msmodelslim_moe.py            | 212 ++++++++++++++++--
 1 file changed, 194 insertions(+), 18 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index c12dcf39fd47..3b7d5172541e 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -17,6 +17,7 @@
 )
 from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    NPUW4A8Int8DynamicMoEMethod,
     NPUW8A8Int8DynamicMoEMethod,
 )
 
@@ -37,6 +38,7 @@
 
 __all__ = [
     "ModelSlimMoEMethod",
+    "ModelSlimW4A8Int8MoE",
     "ModelSlimW8A8Int8MoE",
 ]
 
@@ -56,24 +58,198 @@ def get_moe_method(
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
 
+        prefix_in_quant_config = prefix + ".0.down_proj.weight"
+        is_moe_w4a8_dynamic = (
+            quant_config.quant_description.get(prefix_in_quant_config, "STATIC")
+            == "W4A8_DYNAMIC"
+        )
+
+        if is_moe_w4a8_dynamic:
+            return ModelSlimW4A8Int8MoE(quant_config)
+
         return ModelSlimW8A8Int8MoE(quant_config)
-        # weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
-        # input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
-        # is_moe_w4_dynamic = quant_config.is_dynamic_token_w4(weight_quant, input_quant)
-        # is_moe_input_quant = input_quant
-
-        # if (
-        #     is_moe_w4_dynamic and is_moe_input_quant is not None
-        # ) or quant_config._is_moe_w4a8_dynamic(prefix, weight_quant, input_quant):
-        #     return NPUW4A8Int4DynamicMoEMethod(quant_config)
-        # elif is_moe_w4_dynamic and is_moe_input_quant is None:
-        #     return NPUW4A16Int4DynamicMoEMethod(quant_config)
-        # else:
-        #     return NPUW8A8Int8DynamicMoEMethod(quant_config)
-        # else:
-        #     raise RuntimeError(
-        #         f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
-        #     )
+
+
+class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod):
+
+    def __init__(
+            self, quant_config: Dict[str, Any], prefix: str = None,
+    ):
+        self.quant_config = quant_config
+        self.group_size = 0
+        self.tp_size = 1
+        self.is_per_channel_weight = self.group_size == 0
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.num_experts = num_experts
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        # >> weight
+        w13_output_size = intermediate_size_per_partition
+        w2_output_size = hidden_size // 2
+        w13_weight = torch.nn.Parameter(
+            torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                w2_output_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # >> scale
+        w13_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # >> offset
+        w13_weight_offset = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_offset", w13_weight_offset)
+        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+
+        w2_weight_offset = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_offset", w2_weight_offset)
+        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+        if not self.is_per_channel_weight:
+            # >>> special param for w4a8
+            w13_weight_scale_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
+            set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
+            w13_weight_offset_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second)
+            set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
+
+            w2_weight_scale_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second)
+            set_weight_attrs(w2_weight_scale_second, extra_weight_attrs)
+
+            w2_weight_offset_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second)
+            set_weight_attrs(w2_weight_offset_second, extra_weight_attrs)
+
+        w13_scale_bias = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scale_bias", w13_scale_bias)
+        set_weight_attrs(w13_scale_bias, extra_weight_attrs)
+
+        w2_scale_bias = torch.nn.Parameter(
+            torch.empty(
+                num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scale_bias", w2_scale_bias)
+        set_weight_attrs(w2_scale_bias, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        print(layer)
+        NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        self.moe_runner_config = moe_runner_config
+    
+    def apply(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+    
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
+                                                                        hidden_states,
+                                                                        hidden_states_scale,
+                                                                        group_list_type,
+                                                                        group_list,
+                                                                        output_dtype,)
 
 
 class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod):
@@ -183,4 +359,4 @@ def apply_without_routing_weights(
                                                                         hidden_states_scale,
                                                                         group_list_type,
                                                                         group_list,
-                                                                        output_dtype,)
\ No newline at end of file
+                                                                        output_dtype,)

From 28ff8e09539e492ab1b8dfd6e0ca2fe435034534 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 12 Dec 2025 17:48:48 +0300
Subject: [PATCH 024/175] Add w4a8 support (kernel)

---
 .../npu/quantization/fused_moe_method_npu.py  | 254 ++++++++----------
 1 file changed, 106 insertions(+), 148 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 5b5098ed567c..ac32ad5035b0 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -247,151 +247,22 @@ def apply_without_routing_weights(
         return hidden_states
 
 
-class NPUW4A8Int4DynamicMoEMethod(FusedMoEMethodBase):
+class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
 
     def __init__(self) -> None:
-        self.group_size = 256
+        self.group_size = 0 ### TODO or 256
         self.tp_size = 1
+        self.is_per_channel_weight = self.group_size == 0
 
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ) -> None:
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
-
-        self.num_experts = num_experts
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
-        )
-
-        # >> weight
-        w13_output_size = intermediate_size_per_partition
-        w2_output_size = hidden_size // 2
-        w13_weight = torch.nn.Parameter(
-            torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                w2_output_size,
-                intermediate_size_per_partition,
-                dtype=torch.int8,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # >> scale
-        w13_weight_scale = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
-
-        w2_weight_scale = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-
-        # >> offset
-        w13_weight_offset = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_offset", w13_weight_offset)
-        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
-
-        w2_weight_offset = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_offset", w2_weight_offset)
-        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
-
-        # >>> special param for w4a8
-        w13_weight_scale_second = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size // self.group_size,
-                dtype=torch.float32,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
-        set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
-        w13_weight_offset_second = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size // self.group_size,
-                dtype=torch.float32,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second)
-        set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
-
-        w2_weight_scale_second = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition // self.group_size,
-                dtype=torch.float32,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second)
-        set_weight_attrs(w2_weight_scale_second, extra_weight_attrs)
-
-        w2_weight_offset_second = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition // self.group_size,
-                dtype=torch.float32,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second)
-        set_weight_attrs(w2_weight_offset_second, extra_weight_attrs)
-
-        w13_scale_bias = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_scale_bias", w13_scale_bias)
-        set_weight_attrs(w13_scale_bias, extra_weight_attrs)
-
-        w2_scale_bias = torch.nn.Parameter(
-            torch.empty(
-                num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_scale_bias", w2_scale_bias)
-        set_weight_attrs(w2_scale_bias, extra_weight_attrs)
-
+    @classmethod
     def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
+        if self.is_per_channel_weight:
+            scale_np = scale.cpu().numpy()
+            scale_np.dtype = np.uint32
+            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
+                np.int64)).npu()
+            return scale_uint64_tensor, None
         per_group_scale = per_group_scale.transpose(1, 2).contiguous()
         group_num, k, n = weight.shape
         # the weight of the new version is reduced by half by pack n, so it needs to be restored
@@ -416,6 +287,7 @@ def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
         sscale_uint64_tensor = sscale_uint64_tensor.npu()
         return sscale_uint64_tensor, bias
 
+    @classmethod
     def update_bias(self, layer, w13_bias, w2_bias):
         layer.w13_scale_bias.data = (
             layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
@@ -424,6 +296,7 @@ def update_bias(self, layer, w13_bias, w2_bias):
             layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
         )
 
+    @classmethod
     def pack_to_int32(self, weight: torch.Tensor):
         # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
         assert (
@@ -431,6 +304,7 @@ def pack_to_int32(self, weight: torch.Tensor):
         ), "the last dim of weight needs to be divided by 4"
         return weight.view(torch.int32).contiguous()
 
+    @classmethod
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
@@ -469,21 +343,105 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
         layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
 
-    def create_moe_runner(
-        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
-    ):
-        self.moe_runner_config = moe_runner_config
-
+    @classmethod
     def apply(
         self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
-        # FIXME W4A8 only support with deepep
-        raise NotImplementedError(
-            f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep"
-        )
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
 
+        hidden_states = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        top_k=topk_ids.shape[1]
+        group_list_type = 1
+
+        self.original_shape = hidden_states.shape
+        self.topk_weights = topk_weights
+        
+        num_tokens = hidden_states.shape[:-1].numel()
+
+        first_expert_idx = 0
+        last_expert_idx = 128
+        global_num_experts = 128
+
+        sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = (
+            torch.ops.npu.npu_moe_init_routing_v2(
+                hidden_states,
+                topk_ids,
+                active_num=num_tokens * top_k,
+                expert_num=global_num_experts,
+                expert_tokens_num_type=1,
+                expert_tokens_num_flag=True,
+                active_expert_range=[first_expert_idx, last_expert_idx],
+                quant_mode=1,
+            ))
+
+        expert_tokens = expert_tokens.to(torch.int64)
+
+        bias1 = [layer.w13_scale_bias]
+        bias2 = [layer.w2_scale_bias]
+        w1_scale = [layer.w13_weight_scale]
+        w2_scale = [layer.w2_weight_scale]
+        # TODO w4a8 scene: dynamic acquisition of dtype in the future
+        _output_dtype = torch.bfloat16
+        
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[sorted_hidden_states],
+            weight=[layer.w13_weight],
+            scale=w1_scale,
+            bias=bias1,
+            per_token_scale=[pertoken_scale],
+            group_list=expert_tokens,
+            split_item=2,
+            group_type=0,
+            group_list_type=group_list_type,
+            output_dtype=_output_dtype,
+        )[0]
+
+        # act_fn: swiglu
+        hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+
+        output = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[layer.w2_weight],
+            scale=w2_scale,
+            bias=bias2,
+            per_token_scale=[swiglu_out_scale],
+            group_list=expert_tokens,
+            split_item=2,
+            group_type=0,
+            group_list_type=group_list_type,
+            output_dtype=_output_dtype,
+        )[0]
+
+        final_hidden_states = self.token_combine(hidden_states=output)
+
+        return StandardCombineInput(hidden_states=final_hidden_states)
+
+    @classmethod
+    def token_combine(self, 
+                      hidden_states: torch.Tensor,
+                      bias: torch.Tensor = None):
+        assert self.original_shape is not None
+        final_hidden_states = torch.ops.npu.npu_moe_token_unpermute(
+            permuted_tokens=hidden_states,
+            sorted_indices=torch.abs(self.expanded_row_idx),
+            probs=self.topk_weights)
+        if len(self.original_shape) == 3:
+            final_hidden_states = final_hidden_states.view(self.original_shape)
+
+        # these values are no longer used, so they need to be set to None for memory release.
+        self.expert_map = None
+        self.topk_weights = None
+        self.topk_ids = None
+        self.expanded_row_idx = None
+        return final_hidden_states
+
+    @classmethod
     def apply_without_routing_weights(
         self,
         layer,

From d9412d4fadde6f9fcf4e94fcd5f157a7f1d58536 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 12 Dec 2025 18:21:47 +0300
Subject: [PATCH 025/175] Update fused_moe_method_npu.py

---
 .../npu/quantization/fused_moe_method_npu.py  | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index ac32ad5035b0..91a0c633e0db 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -254,7 +254,6 @@ def __init__(self) -> None:
         self.tp_size = 1
         self.is_per_channel_weight = self.group_size == 0
 
-    @classmethod
     def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
         if self.is_per_channel_weight:
@@ -287,7 +286,6 @@ def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
         sscale_uint64_tensor = sscale_uint64_tensor.npu()
         return sscale_uint64_tensor, bias
 
-    @classmethod
     def update_bias(self, layer, w13_bias, w2_bias):
         layer.w13_scale_bias.data = (
             layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
@@ -296,7 +294,6 @@ def update_bias(self, layer, w13_bias, w2_bias):
             layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
         )
 
-    @classmethod
     def pack_to_int32(self, weight: torch.Tensor):
         # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
         assert (
@@ -305,7 +302,7 @@ def pack_to_int32(self, weight: torch.Tensor):
         return weight.view(torch.int32).contiguous()
 
     @classmethod
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+    def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
         )
@@ -323,10 +320,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             if hasattr(layer, "w2_weight_scale_second")
             else None
         )
-        layer.w13_weight_scale.data, w13_bias = self.process_scale(
+        layer.w13_weight_scale.data, w13_bias = cls.process_scale(
             layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second
         )
-        layer.w2_weight_scale.data, w2_bias = self.process_scale(
+        layer.w2_weight_scale.data, w2_bias = cls.process_scale(
             layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second
         )
         if hasattr(layer, "w13_weight_scale_second"):
@@ -336,7 +333,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             del layer.w13_weight_offset_second
             del layer.w2_weight_offset_second
 
-        self.update_bias(layer, w13_bias, w2_bias)
+        cls.update_bias(layer, w13_bias, w2_bias)
 
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
@@ -345,7 +342,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
     @classmethod
     def apply(
-        self,
+        cls,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
@@ -358,8 +355,8 @@ def apply(
         top_k=topk_ids.shape[1]
         group_list_type = 1
 
-        self.original_shape = hidden_states.shape
-        self.topk_weights = topk_weights
+        cls.original_shape = hidden_states.shape
+        cls.topk_weights = topk_weights
         
         num_tokens = hidden_states.shape[:-1].numel()
 
@@ -367,7 +364,7 @@ def apply(
         last_expert_idx = 128
         global_num_experts = 128
 
-        sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = (
+        sorted_hidden_states, cls.expanded_row_idx, expert_tokens, pertoken_scale = (
             torch.ops.npu.npu_moe_init_routing_v2(
                 hidden_states,
                 topk_ids,
@@ -418,11 +415,10 @@ def apply(
             output_dtype=_output_dtype,
         )[0]
 
-        final_hidden_states = self.token_combine(hidden_states=output)
+        final_hidden_states = cls.token_combine(hidden_states=output)
 
         return StandardCombineInput(hidden_states=final_hidden_states)
 
-    @classmethod
     def token_combine(self, 
                       hidden_states: torch.Tensor,
                       bias: torch.Tensor = None):
@@ -441,9 +437,8 @@ def token_combine(self,
         self.expanded_row_idx = None
         return final_hidden_states
 
-    @classmethod
+    @staticmethod
     def apply_without_routing_weights(
-        self,
         layer,
         hidden_states,
         hidden_states_scale,

From 0f81db38feb3d0ef18d92f2676fb2ca3f2743ea9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 15 Dec 2025 11:50:57 +0300
Subject: [PATCH 026/175] Fix w8a8_static bug

---
 .../srt/layers/quantization/msmodelslim/msmodelslim.py       | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 383ca74c0f02..d9e9805a1c0b 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -202,6 +202,11 @@ def _get_scheme_from_parts(
                 quant_config=self.quant_description,
                 prefix=layer_name
             )
+        elif quant_type == "W8A8":
+            return ModelSlimW8A8Int8(
+                quant_config=self.quant_description,
+                prefix=layer_name
+            )
         elif quant_type == "W4A4_DYNAMIC":
             return ModelSlimW4A4Int4(
                 quant_config=self.quant_description,

From 3175d8b30103cf1ddb62dc8e4b10e943ace5ec36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 15 Dec 2025 12:32:10 +0300
Subject: [PATCH 027/175] Improving the code structure

---
 .../srt/layers/quantization/msmodelslim/msmodelslim.py     | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index d9e9805a1c0b..4de4d04ec6b1 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -197,12 +197,7 @@ def _get_scheme_from_parts(
         ) -> ModelSlimScheme:
 
         quant_type = self.quant_description[layer_name + '.weight']
-        if quant_type == "W8A8_DYNAMIC":
-            return ModelSlimW8A8Int8(
-                quant_config=self.quant_description,
-                prefix=layer_name
-            )
-        elif quant_type == "W8A8":
+        if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8":
             return ModelSlimW8A8Int8(
                 quant_config=self.quant_description,
                 prefix=layer_name

From 23db53f937b1a2c1f14ca17d2eb8d1e12cb05880 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 15 Dec 2025 13:51:08 +0300
Subject: [PATCH 028/175] Delete print()

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py       | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 3b7d5172541e..fe83dc771117 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -219,7 +219,6 @@ def create_weights(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        print(layer)
         NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer)
 
     def create_moe_runner(

From 393f7d1d2167aee713c6852d2d9c6b106eac6379 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 15 Dec 2025 13:52:23 +0300
Subject: [PATCH 029/175] Update w4a8 for MOE

---
 .../npu/quantization/fused_moe_method_npu.py  | 61 ++++++++-----------
 1 file changed, 24 insertions(+), 37 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 91a0c633e0db..54622a1e0873 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -142,7 +142,8 @@ def npu_fused_moe_without_routing_weights_bf16(
 
 class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase):
 
-    def release_weight_cache(self, weight: torch.Tensor):
+    @classmethod
+    def release_weight_cache(cls, weight: torch.Tensor):
         # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
         origin_weight = weight.data.transpose(1, 2)
         new_weight = origin_weight.contiguous()
@@ -249,14 +250,11 @@ def apply_without_routing_weights(
 
 class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
 
-    def __init__(self) -> None:
-        self.group_size = 0 ### TODO or 256
-        self.tp_size = 1
-        self.is_per_channel_weight = self.group_size == 0
-
-    def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
+    @classmethod
+    def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
-        if self.is_per_channel_weight:
+        #if cls.is_per_channel_weight:
+        if True:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
             scale_uint64_tensor = torch.from_numpy(scale_np.astype(
@@ -286,7 +284,8 @@ def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
         sscale_uint64_tensor = sscale_uint64_tensor.npu()
         return sscale_uint64_tensor, bias
 
-    def update_bias(self, layer, w13_bias, w2_bias):
+    @classmethod
+    def update_bias(cls, layer, w13_bias, w2_bias):
         layer.w13_scale_bias.data = (
             layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
         )
@@ -294,7 +293,8 @@ def update_bias(self, layer, w13_bias, w2_bias):
             layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
         )
 
-    def pack_to_int32(self, weight: torch.Tensor):
+    @classmethod
+    def pack_to_int32(cls, weight: torch.Tensor):
         # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
         assert (
             weight.shape[-1] % 4 == 0
@@ -337,12 +337,11 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
 
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
-        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
-        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
+        layer.w13_weight.data = cls.pack_to_int32(layer.w13_weight.data)
+        layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data)
 
-    @classmethod
+    staticmethod
     def apply(
-        cls,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
@@ -354,9 +353,8 @@ def apply(
         topk_weights, topk_ids, _ = topk_output
         top_k=topk_ids.shape[1]
         group_list_type = 1
-
-        cls.original_shape = hidden_states.shape
-        cls.topk_weights = topk_weights
+        original_shape = hidden_states.shape
+        topk_weights = topk_weights
         
         num_tokens = hidden_states.shape[:-1].numel()
 
@@ -364,7 +362,7 @@ def apply(
         last_expert_idx = 128
         global_num_experts = 128
 
-        sorted_hidden_states, cls.expanded_row_idx, expert_tokens, pertoken_scale = (
+        sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = (
             torch.ops.npu.npu_moe_init_routing_v2(
                 hidden_states,
                 topk_ids,
@@ -415,30 +413,19 @@ def apply(
             output_dtype=_output_dtype,
         )[0]
 
-        final_hidden_states = cls.token_combine(hidden_states=output)
+        assert original_shape is not None
+        final_hidden_states = torch.ops.npu.npu_moe_token_unpermute(
+            permuted_tokens=output,
+            sorted_indices=torch.abs(expanded_row_idx),
+            probs=topk_weights)
+        if len(original_shape) == 3:
+            final_hidden_states = final_hidden_states.view(original_shape)
 
         return StandardCombineInput(hidden_states=final_hidden_states)
 
-    def token_combine(self, 
-                      hidden_states: torch.Tensor,
-                      bias: torch.Tensor = None):
-        assert self.original_shape is not None
-        final_hidden_states = torch.ops.npu.npu_moe_token_unpermute(
-            permuted_tokens=hidden_states,
-            sorted_indices=torch.abs(self.expanded_row_idx),
-            probs=self.topk_weights)
-        if len(self.original_shape) == 3:
-            final_hidden_states = final_hidden_states.view(self.original_shape)
-
-        # these values are no longer used, so they need to be set to None for memory release.
-        self.expert_map = None
-        self.topk_weights = None
-        self.topk_ids = None
-        self.expanded_row_idx = None
-        return final_hidden_states
-
     @staticmethod
     def apply_without_routing_weights(
+        cls,
         layer,
         hidden_states,
         hidden_states_scale,

From d4d53e084e3f13cbc6c9d16e8412fe121a828aea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 16 Dec 2025 14:42:47 +0300
Subject: [PATCH 030/175] Fix w4a4 weights loading

---
 .../schemes/msmodelslim_w4a4_int4.py          | 35 +++++++++++++------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
index 1d633fcbb06a..3bbbf4af1f2d 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
@@ -34,6 +34,23 @@ def __init__(
             self.quant_config[prefix + ".weight"]
             == "W4A4_DYNAMIC"
         )
+        
+    @staticmethod
+    def get_weight(
+        input_size: int, output_size: int, params_dtype: torch.dtype
+    ) -> Dict[str, Any]:
+        params_dict = {"weight": torch.empty(output_size, input_size, dtype=torch.int8)}
+        return params_dict
+
+    @staticmethod
+    def get_perchannel_param(
+        output_size: int,
+        params_dtype: torch.dtype,
+    ) -> Dict[str, Any]:
+        params_dict = {}
+        params_dict["weight_scale"] = torch.empty(output_size, 1, dtype=params_dtype)
+        params_dict["weight_offset"] = torch.empty(output_size, 1, dtype=params_dtype)
+        return params_dict
 
     def create_weights(
         self,
@@ -47,17 +64,15 @@ def create_weights(
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
-
-        weight_dict = self.quant_method.get_weight(
-            input_size_per_partition, output_size_per_partition, params_dtype
-        )
+        
+        weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)}
         for weight_name, weight_param in weight_dict.items():
             param = torch.nn.Parameter(weight_param, requires_grad=False)
             set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
             layer.register_parameter(weight_name, param)
             set_weight_attrs(param, extra_weight_attrs)
 
-        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
+        pertensor_dict = {}
         for pertensor_name, pertensor_param in pertensor_dict.items():
             param = PerTensorScaleParameter(
                 data=pertensor_param, weight_loader=weight_loader
@@ -65,10 +80,10 @@ def create_weights(
             # disable warning
             param.ignore_warning = True
             layer.register_parameter(pertensor_name, param)
-
-        perchannel_dict = self.quant_method.get_perchannel_param(
-            output_size_per_partition, params_dtype
-        )
+        
+        perchannel_dict = {}
+        perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
+        perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
         for perchannel_name, perchannel_param in perchannel_dict.items():
             param = torch.nn.Parameter(perchannel_param, requires_grad=False)
             set_weight_attrs(param, {"output_dim": 0})
@@ -84,4 +99,4 @@ def apply_weights(
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias)
\ No newline at end of file
+        return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias)

From 2bb7acf021d69c3badb695dc18fb71f682d85d4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 16 Dec 2025 14:48:03 +0300
Subject: [PATCH 031/175] Update model_config.py

---
 python/sglang/srt/configs/model_config.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 1361d58abe41..991df9756b7f 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -589,12 +589,11 @@ def _parse_quant_hf_config(self):
         return quant_cfg
 
     def _find_quant_modelslim_config(self):
-        quant_config_file = Path(self.model_path, "quant_model_description.json")
-        if quant_config_file.is_file():
-            with open(quant_config_file) as f:
+        quant_config_file = Path(self.model_path, "quant_model_description.json")   
+        quant_cfg = None 
+        if quant_config_file.is_file(): 
+            with open(quant_config_file) as f: 
                 quant_cfg = json.load(f)
-        else:
-            quant_cfg = None
                             
         return quant_cfg
 

From 4a05e5d361f1ce179d3e12317bf1a451fc5602f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 16 Dec 2025 14:52:50 +0300
Subject: [PATCH 032/175] Add w4a4 test

---
 .../ascend/test_ascend_w4a4_quantization.py   | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 test/manual/ascend/test_ascend_w4a4_quantization.py

diff --git a/test/manual/ascend/test_ascend_w4a4_quantization.py b/test/manual/ascend/test_ascend_w4a4_quantization.py
new file mode 100644
index 000000000000..c2251ec94a9d
--- /dev/null
+++ b/test/manual/ascend/test_ascend_w4a4_quantization.py
@@ -0,0 +1,108 @@
+"""
+Usage:
+python3 -m unittest test_ascend_w4a4_quantization.TestAscendW4A4.test_gsm8k
+"""
+
+import os
+import time
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestAscendW4A4(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+                "--tp-size",
+                "2",
+                "--mem-fraction-static",
+                "0.8",
+                "--cuda-graph-bs",
+                "64",
+                "--disable-radix-cache",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        base_url = DEFAULT_URL_FOR_TEST
+        url = urlparse(base_url)
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=128,
+            max_new_tokens=512,
+            parallel=64,
+            host=f"http://{url.hostname}",
+            port=int(url.port),
+        )
+        metrics = run_eval(args)
+        print(metrics)
+
+        self.assertGreaterEqual(metrics["accuracy"], 0.75)
+        self.assertGreaterEqual(metrics["output_throughput"], 700)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(res["text"])
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+
+        if is_in_ci():
+            self.assertGreaterEqual(throughput, 25)
+
+
+if __name__ == "__main__":
+    unittest.main()

From d0a577fb4ff0cdd9856dfdca51f7a4c19f1d2d3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 16 Dec 2025 14:54:29 +0300
Subject: [PATCH 033/175] Add compressed-tensors unit-test

---
 .../ascend/test_ascend_w8a8_quantization.py   | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py
index bf139f46a872..5c1bc3b66562 100644
--- a/test/manual/ascend/test_ascend_w8a8_quantization.py
+++ b/test/manual/ascend/test_ascend_w8a8_quantization.py
@@ -98,6 +98,76 @@ def test_throughput(self):
 
         if is_in_ci():
             self.assertGreaterEqual(throughput, 25)
+            
+
+class TestAscendW8A8CompressedTensors(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        #TODO: Move model to CI or Modelscope
+        cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--disable-cuda-graph",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        base_url = DEFAULT_URL_FOR_TEST
+        url = urlparse(base_url)
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{url.hostname}",
+            port=int(url.port),
+        )
+        metrics = run_eval(args)
+        print(metrics)
+
+        self.assertGreaterEqual(metrics["accuracy"], 0.3)
+        self.assertGreaterEqual(metrics["output_throughput"], 1000)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(res["text"])
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+
+        if is_in_ci():
+            self.assertGreaterEqual(throughput, 25)
 
 
 if __name__ == "__main__":

From 77a923e4ea63dc6b55c49a5fdbba9ac1fc777fe3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A1=D0=B0=D0=B2=D0=BA=D0=B8=D0=BD=20=D0=90=D1=80=D1=82?=
 =?UTF-8?q?=D0=B5=D0=BC?= <savkinartem@MacBook-Air-Viktoria.local>
Date: Wed, 17 Dec 2025 13:25:44 +0300
Subject: [PATCH 034/175] Pre-commit fixes

---
 python/sglang/srt/configs/model_config.py     |  17 +-
 .../npu/quantization/fused_moe_method_npu.py  |  20 ++-
 .../npu/quantization/linear_method_npu.py     |  11 +-
 .../compressed_tensors/compressed_tensors.py  |   4 +-
 .../schemes/compressed_tensors_w8a8_int8.py   |  27 +--
 .../quantization/msmodelslim/msmodelslim.py   | 158 +++++++++---------
 .../msmodelslim/msmodelslim_moe.py            |  63 ++++---
 .../msmodelslim/schemes/__init__.py           |   2 +-
 .../schemes/msmodelslim_w4a4_int4.py          |  48 +++---
 .../schemes/msmodelslim_w8a8_int8.py          |  27 ++-
 .../ascend/test_ascend_w8a8_quantization.py   |   4 +-
 11 files changed, 175 insertions(+), 206 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index a4253b9f59ce..8e7e98a77d0b 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -16,8 +16,8 @@
 import logging
 import math
 import os
-from pathlib import Path
 from enum import Enum, IntEnum, auto
+from pathlib import Path
 from typing import Any, List, Optional, Set, Union
 
 import torch
@@ -26,7 +26,7 @@
 from sglang.srt.environ import envs
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_hip, retry, is_npu
+from sglang.srt.utils import is_hip, is_npu, retry
 from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
@@ -39,6 +39,7 @@
 logger = logging.getLogger(__name__)
 _is_npu = is_npu()
 
+
 class AttentionArch(IntEnum):
     MLA = auto()
     MHA = auto()
@@ -596,12 +597,12 @@ def _parse_quant_hf_config(self):
         return quant_cfg
 
     def _find_quant_modelslim_config(self):
-        quant_config_file = Path(self.model_path, "quant_model_description.json")   
-        quant_cfg = None 
-        if quant_config_file.is_file(): 
-            with open(quant_config_file) as f: 
+        quant_config_file = Path(self.model_path, "quant_model_description.json")
+        quant_cfg = None
+        if quant_config_file.is_file():
+            with open(quant_config_file) as f:
                 quant_cfg = json.load(f)
-                            
+
         return quant_cfg
 
     def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
@@ -724,7 +725,7 @@ def _verify_quantization(self) -> None:
         quant_cfg = self._parse_quant_hf_config()
         if _is_npu:
             quant_cfg = self._find_quant_modelslim_config()
-            self.quantization = 'modelslim'
+            self.quantization = "modelslim"
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get(
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 54622a1e0873..dfbab790d1ed 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -199,7 +199,7 @@ def apply(
             top_k=topk_ids.shape[1],
         )
         return StandardCombineInput(hidden_states=output)
-    
+
     @staticmethod
     def apply_without_routing_weights(
         layer,
@@ -253,12 +253,11 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
     @classmethod
     def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
-        #if cls.is_per_channel_weight:
+        # if cls.is_per_channel_weight:
         if True:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
-            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
-                np.int64)).npu()
+            scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()
             return scale_uint64_tensor, None
         per_group_scale = per_group_scale.transpose(1, 2).contiguous()
         group_num, k, n = weight.shape
@@ -341,6 +340,7 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data)
 
     staticmethod
+
     def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",
@@ -351,11 +351,11 @@ def apply(
         topk_output = dispatch_output.topk_output
 
         topk_weights, topk_ids, _ = topk_output
-        top_k=topk_ids.shape[1]
+        top_k = topk_ids.shape[1]
         group_list_type = 1
         original_shape = hidden_states.shape
         topk_weights = topk_weights
-        
+
         num_tokens = hidden_states.shape[:-1].numel()
 
         first_expert_idx = 0
@@ -372,7 +372,8 @@ def apply(
                 expert_tokens_num_flag=True,
                 active_expert_range=[first_expert_idx, last_expert_idx],
                 quant_mode=1,
-            ))
+            )
+        )
 
         expert_tokens = expert_tokens.to(torch.int64)
 
@@ -382,7 +383,7 @@ def apply(
         w2_scale = [layer.w2_weight_scale]
         # TODO w4a8 scene: dynamic acquisition of dtype in the future
         _output_dtype = torch.bfloat16
-        
+
         hidden_states = torch.ops.npu.npu_grouped_matmul(
             x=[sorted_hidden_states],
             weight=[layer.w13_weight],
@@ -417,7 +418,8 @@ def apply(
         final_hidden_states = torch.ops.npu.npu_moe_token_unpermute(
             permuted_tokens=output,
             sorted_indices=torch.abs(expanded_row_idx),
-            probs=topk_weights)
+            probs=topk_weights,
+        )
         if len(original_shape) == 3:
             final_hidden_states = final_hidden_states.view(original_shape)
 
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 6481b4f79bf4..681d45d18f0b 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -1,13 +1,8 @@
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
 from sglang.srt.hardware_backend.npu.utils import npu_format_cast
-from sglang.srt.layers.parameter import (
-    ChannelQuantScaleParameter,
-    ModelWeightParameter,
-    PerTensorScaleParameter,
-)
 from sglang.srt.layers.quantization.base_config import LinearMethodBase
 
 if TYPE_CHECKING:
@@ -25,7 +20,7 @@ def __init__(
 
 
 class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
-    
+
     @staticmethod
     def apply(
         layer: torch.nn.Module,
@@ -139,4 +134,4 @@ def process_weights_after_loading(layer):
         layer.weight_offset.data = layer.weight_offset.data.flatten()
         layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
             layer.weight.data.to(torch.int32)
-        )
\ No newline at end of file
+        )
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 864bc91cc838..4e10c5d734eb 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -32,10 +32,10 @@
     CompressedTensorsScheme,
     CompressedTensorsW4A4Fp4,
     CompressedTensorsW8A8Fp8,
-    GPUCompressedTensorsW8A8Int8,
-    NPUCompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16,
+    GPUCompressedTensorsW8A8Int8,
+    NPUCompressedTensorsW8A8Int8,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import (
     find_matched_target,
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index fb12922df3be..64401aea6a71 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -7,6 +7,10 @@
 from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPUW8A8Int8DynamicLinearMethod,
+    NPUW8A8Int8LinearMethod,
+)
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,
     ModelWeightParameter,
@@ -15,10 +19,6 @@
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
 )
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
-    NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod
-)
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
 from sglang.srt.layers.quantization.utils import requantize_with_max_scale
 from sglang.srt.utils import is_cuda
@@ -94,15 +94,11 @@ def create_weights(
 
 
 class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
-    
+
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(
-            strategy,
-            is_static_input_scheme,
-            input_symmetric
-        )
+        super.__init__(strategy, is_static_input_scheme, input_symmetric)
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -190,15 +186,11 @@ def apply_weights(
 
 
 class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
-    
+
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(
-            strategy,
-            is_static_input_scheme,
-            input_symmetric
-        )
+        super.__init__(strategy, is_static_input_scheme, input_symmetric)
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -209,10 +201,9 @@ def process_weights_after_loading(self, layer):
             return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
         else:
             return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
-    
+
     def apply_weights(self, layer, x, bias):
         if self.is_static_input_scheme:
             return NPUW8A8Int8LinearMethod.apply(layer)
         else:
             return NPUW8A8Int8DynamicLinearMethod.apply(layer)
-
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 4de4d04ec6b1..7825b3fd2027 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -5,42 +5,36 @@
 from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
-from pydantic import BaseModel
 
 # from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
 #     NPUW4A8Int4DynamicMoEMethod,
 #     NPUW4A16Int4DynamicMoEMethod,
 #     NPUW8A8Int8DynamicMoEMethod,
 # )
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
-    _NPULinearMethodBase
-    # NPUW8A8Int8DynamicLinearMethod,
-    # NPUW8A8Int8LinearMethod,
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (  # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod,
+    _NPULinearMethodBase,
 )
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import (
     ModelSlimMoEMethod,
 )
 from sglang.srt.layers.quantization.msmodelslim.schemes import (
     ModelSlimScheme,
-    ModelSlimW8A8Int8,
     ModelSlimW4A4Int4,
+    ModelSlimW8A8Int8,
 )
-from sglang.srt.layers.quantization.compressed_tensors.utils import (
-    find_matched_target,
-    is_activation_quantization_format,
-    should_ignore_layer
-)
-#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+
+# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.utils import apply_module_patch
 
 logger = logging.getLogger(__name__)
 
+
 # func refers to RMSNorm.__init__
 def npu_wrapper_rmsnorm_init(func):
     def init(self, hidden_size: int, **extra_args) -> None:
@@ -51,6 +45,7 @@ def init(self, hidden_size: int, **extra_args) -> None:
 
     return init
 
+
 # func refers to RMSNorm.forward_oot
 def npu_wrapper_rmsnorm_forward(func):
     def _rmsnorm_forward_oot(
@@ -122,7 +117,7 @@ def __init__(self, quant_config: Dict[str, Any] = {}):
                     "forward_npu",
                     [npu_wrapper_rmsnorm_forward],
                 )
-    
+
     def get_linear_method(self) -> ModelSlimLinearMethod:
         return ModelSlimLinearMethod(self)
 
@@ -183,29 +178,28 @@ def get_quant_method(
                 return UnquantizedLinearMethod()
             scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config)
             if scheme is None:
-                raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.")
+                raise NotImplementedError(
+                    "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes."
+                )
             layer.scheme = scheme
-            return (
-                ModelSlimLinearMethod(self)
-            )
+            return ModelSlimLinearMethod(self)
         elif isinstance(layer, FusedMoE):
             return ModelSlimMoEMethod.get_moe_method(self, layer, prefix)
         return None
 
     def _get_scheme_from_parts(
-            self, layer_name: str,
-        ) -> ModelSlimScheme:
+        self,
+        layer_name: str,
+    ) -> ModelSlimScheme:
 
-        quant_type = self.quant_description[layer_name + '.weight']
+        quant_type = self.quant_description[layer_name + ".weight"]
         if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8":
             return ModelSlimW8A8Int8(
-                quant_config=self.quant_description,
-                prefix=layer_name
+                quant_config=self.quant_description, prefix=layer_name
             )
         elif quant_type == "W4A4_DYNAMIC":
             return ModelSlimW4A4Int4(
-                quant_config=self.quant_description,
-                prefix=layer_name
+                quant_config=self.quant_description, prefix=layer_name
             )
 
             # Detect If Mixed Precision
@@ -225,66 +219,66 @@ def _get_scheme_from_parts(
             #             "Other method (CompressedTensorsW4A16Sparse24) is not supported now"
             #         )
 
-            #if is_activation_quantization_format(self.quant_format):
-                # if self._is_fp8_w8a8(weight_quant, input_quant):
-                #     is_fp8_w8a8_supported = self._check_scheme_supported(
-                #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
-                #     )
-                #     if is_fp8_w8a8_supported:
-                #         return CompressedTensorsW8A8Fp8(
-                #             strategy=weight_quant.strategy,
-                #             is_static_input_scheme=(
-                #                 input_quant and not input_quant.dynamic
-                #             ),
-                #         )
-                #     else:
-                #         # note: input_quant will be present for converted models;
-                #         # will be ignored during inference post loading
-                #         return CompressedTensorsW8A16Fp8(
-                #             strategy=weight_quant.strategy,
-                #             is_static_input_scheme=not input_quant.dynamic,
-                #         )
-
-                # # note: input_quant can be None
-                # if self._is_fp8_w8a16(weight_quant, input_quant):
-                #     is_static_input_scheme = input_quant and not input_quant.dynamic
-                #     return CompressedTensorsW8A16Fp8(
-                #         strategy=weight_quant.strategy,
-                #         is_static_input_scheme=is_static_input_scheme,
-                #     )
-
-            #raise NotImplementedError("No msmodelslim compatible scheme was found.")
-    
-    def get_scheme(
-            self, layer: torch.nn.Module, layer_name: Optional[str] = None
-        ) -> Optional[ModelSlimScheme]:
-            """
-            get_scheme method adjusted for modelslim, taken from
-            python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
-            """
-            # if self.target_scheme_map:
-            #     matched_target = find_matched_target(
-            #         layer_name=layer_name,
-            #         module=layer,
-            #         targets=self.target_scheme_map.keys(),
-            #         fused_mapping=self.packed_modules_mapping,
+            # if is_activation_quantization_format(self.quant_format):
+            # if self._is_fp8_w8a8(weight_quant, input_quant):
+            #     is_fp8_w8a8_supported = self._check_scheme_supported(
+            #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
             #     )
+            #     if is_fp8_w8a8_supported:
+            #         return CompressedTensorsW8A8Fp8(
+            #             strategy=weight_quant.strategy,
+            #             is_static_input_scheme=(
+            #                 input_quant and not input_quant.dynamic
+            #             ),
+            #         )
+            #     else:
+            #         # note: input_quant will be present for converted models;
+            #         # will be ignored during inference post loading
+            #         return CompressedTensorsW8A16Fp8(
+            #             strategy=weight_quant.strategy,
+            #             is_static_input_scheme=not input_quant.dynamic,
+            #         )
 
-            #     scheme_dict = self.target_scheme_map[matched_target]
-            #     weight_quant = scheme_dict.get("weights")
-            #     input_quant = scheme_dict.get("input_activations")
-            # else:
-                # Find the quant_scheme
-            scheme = self._get_scheme_from_parts(  # type: ignore
-                # weight_quant=weight_quant,
-                # input_quant=input_quant,
-                layer_name=layer_name,
-            )
+            # # note: input_quant can be None
+            # if self._is_fp8_w8a16(weight_quant, input_quant):
+            #     is_static_input_scheme = input_quant and not input_quant.dynamic
+            #     return CompressedTensorsW8A16Fp8(
+            #         strategy=weight_quant.strategy,
+            #         is_static_input_scheme=is_static_input_scheme,
+            #     )
+
+            # raise NotImplementedError("No msmodelslim compatible scheme was found.")
+
+    def get_scheme(
+        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+    ) -> Optional[ModelSlimScheme]:
+        """
+        get_scheme method adjusted for modelslim, taken from
+        python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+        """
+        # if self.target_scheme_map:
+        #     matched_target = find_matched_target(
+        #         layer_name=layer_name,
+        #         module=layer,
+        #         targets=self.target_scheme_map.keys(),
+        #         fused_mapping=self.packed_modules_mapping,
+        #     )
+
+        #     scheme_dict = self.target_scheme_map[matched_target]
+        #     weight_quant = scheme_dict.get("weights")
+        #     input_quant = scheme_dict.get("input_activations")
+        # else:
+        # Find the quant_scheme
+        scheme = self._get_scheme_from_parts(  # type: ignore
+            # weight_quant=weight_quant,
+            # input_quant=input_quant,
+            layer_name=layer_name,
+        )
 
-            # Ascend doesn't support device capability
-            # self._check_scheme_supported(scheme.get_min_capability())
-            logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
-            return scheme
+        # Ascend doesn't support device capability
+        # self._check_scheme_supported(scheme.get_min_capability())
+        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
+        return scheme
 
     def is_layer_skipped(
         self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index fe83dc771117..4b7b596c4f8d 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -2,25 +2,16 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import enum
 import logging
-from enum import Enum
-from typing import Callable, Optional, TYPE_CHECKING
-from typing import Any, Dict, List
+from typing import TYPE_CHECKING, Any, Dict
 
 import torch
 
-from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
-from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-from sglang.srt.layers.quantization.msmodelslim.schemes import (
-    ModelSlimScheme,
-)
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
     NPUW4A8Int8DynamicMoEMethod,
     NPUW8A8Int8DynamicMoEMethod,
 )
-
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
 from sglang.srt.utils import set_weight_attrs
 
 if TYPE_CHECKING:
@@ -29,9 +20,7 @@
         CombineInput,
         StandardDispatchOutput,
     )
-    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import (
-        ModelSlimConfig,
-    )
+    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +62,9 @@ def get_moe_method(
 class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod):
 
     def __init__(
-            self, quant_config: Dict[str, Any], prefix: str = None,
+        self,
+        quant_config: Dict[str, Any],
+        prefix: str = None,
     ):
         self.quant_config = quant_config
         self.group_size = 0
@@ -173,7 +164,9 @@ def create_weights(
                 ),
                 requires_grad=False,
             )
-            layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second)
+            layer.register_parameter(
+                "w13_weight_offset_second", w13_weight_offset_second
+            )
             set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
 
             w2_weight_scale_second = torch.nn.Parameter(
@@ -225,14 +218,13 @@ def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
     ):
         self.moe_runner_config = moe_runner_config
-    
+
     def apply(
         self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
         return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
-    
 
     def apply_without_routing_weights(
         self,
@@ -243,18 +235,22 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
-                                                                        hidden_states,
-                                                                        hidden_states_scale,
-                                                                        group_list_type,
-                                                                        group_list,
-                                                                        output_dtype,)
+        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
 
 
 class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod):
 
     def __init__(
-            self, quant_config: Dict[str, Any], prefix: str = None,
+        self,
+        quant_config: Dict[str, Any],
+        prefix: str = None,
     ):
         self.quant_config = quant_config
 
@@ -335,14 +331,13 @@ def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
     ):
         self.moe_runner_config = moe_runner_config
-    
+
     def apply(
         self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
         return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
-    
 
     def apply_without_routing_weights(
         self,
@@ -353,9 +348,11 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
-                                                                        hidden_states,
-                                                                        hidden_states_scale,
-                                                                        group_list_type,
-                                                                        group_list,
-                                                                        output_dtype,)
+        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
index 997892772977..fba516eed7c0 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .msmodelslim_scheme import ModelSlimScheme
-from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8
 from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4
+from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8
 
 __all__ = [
     "ModelSlimScheme",
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
index 3bbbf4af1f2d..1b578837c8d4 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
@@ -1,40 +1,28 @@
 # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Callable, Optional
+from typing import Any, Dict, List, Optional
 
 import torch
-from torch.nn import Parameter
 
-from typing import Any, Dict, List
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     NPU_W4A4DynamicLinearMethod,
 )
-from sglang.srt.layers.parameter import (
-    ChannelQuantScaleParameter,
-    ModelWeightParameter,
-    PerTensorScaleParameter,
-)
-from sglang.srt.layers.quantization.msmodelslim.schemes import (
-    ModelSlimScheme,
-)
-
+from sglang.srt.layers.parameter import PerTensorScaleParameter
+from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
 from sglang.srt.utils import set_weight_attrs
 
 
 class ModelSlimW4A4Int4(ModelSlimScheme):
 
     def __init__(
-        self, quant_config: Dict[str, any], prefix: str,
+        self,
+        quant_config: Dict[str, any],
+        prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = (
-            self.quant_config[prefix + ".weight"]
-            == "W4A4_DYNAMIC"
-        )
-        
+        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC"
+
     @staticmethod
     def get_weight(
         input_size: int, output_size: int, params_dtype: torch.dtype
@@ -64,8 +52,12 @@ def create_weights(
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
-        
-        weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)}
+
+        weight_dict = {
+            "weight": torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=torch.int8
+            )
+        }
         for weight_name, weight_param in weight_dict.items():
             param = torch.nn.Parameter(weight_param, requires_grad=False)
             set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
@@ -80,10 +72,14 @@ def create_weights(
             # disable warning
             param.ignore_warning = True
             layer.register_parameter(pertensor_name, param)
-        
+
         perchannel_dict = {}
-        perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
-        perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
+        perchannel_dict["weight_scale"] = torch.empty(
+            output_size_per_partition, 1, dtype=params_dtype
+        )
+        perchannel_dict["weight_offset"] = torch.empty(
+            output_size_per_partition, 1, dtype=params_dtype
+        )
         for perchannel_name, perchannel_param in perchannel_dict.items():
             param = torch.nn.Parameter(perchannel_param, requires_grad=False)
             set_weight_attrs(param, {"output_dim": 0})
@@ -92,7 +88,7 @@ def create_weights(
 
     def process_weights_after_loading(self, layer):
         NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer)
-    
+
     def apply_weights(
         self,
         layer: torch.nn.Module,
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index b33764b858a9..de99c9fed0b7 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -1,38 +1,31 @@
 # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Callable, Optional
+from typing import Dict, List, Optional
 
 import torch
-from torch.nn import Parameter
 
-from typing import Any, Dict, List
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod
+    NPUW8A8Int8LinearMethod,
 )
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from sglang.srt.layers.quantization.msmodelslim.schemes import (
-    ModelSlimScheme,
-)
+from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
 
 
 class ModelSlimW8A8Int8(ModelSlimScheme):
 
     def __init__(
-        self, quant_config: Dict[str, any], prefix: str,
+        self,
+        quant_config: Dict[str, any],
+        prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = (
-            self.quant_config[prefix + ".weight"]
-            == "W8A8_DYNAMIC"
-        )
+        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC"
 
     def create_weights(
         self,
@@ -70,7 +63,7 @@ def create_weights(
             weight_loader=weight_loader,
         )
         layer.register_parameter("weight_offset", weight_offset)
-        
+
         if not self.is_dynamic:
             input_scale = PerTensorScaleParameter(
                 data=torch.empty(1, dtype=params_dtype),
@@ -111,7 +104,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module):
             NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
         else:
             NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
-    
+
     def apply_weights(
         self,
         layer: torch.nn.Module,
@@ -121,4 +114,4 @@ def apply_weights(
         if self.is_dynamic:
             return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias)
         else:
-            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
\ No newline at end of file
+            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py
index 5c1bc3b66562..959bf88a513f 100644
--- a/test/manual/ascend/test_ascend_w8a8_quantization.py
+++ b/test/manual/ascend/test_ascend_w8a8_quantization.py
@@ -98,12 +98,12 @@ def test_throughput(self):
 
         if is_in_ci():
             self.assertGreaterEqual(throughput, 25)
-            
+
 
 class TestAscendW8A8CompressedTensors(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        #TODO: Move model to CI or Modelscope
+        # TODO: Move model to CI or Modelscope
         cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(

From 39179198f4be81f3ea19a808f8def4e7f01220d0 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Wed, 17 Dec 2025 13:29:34 +0300
Subject: [PATCH 035/175] Revert "Pre-commit fixes"

This reverts commit 77a923e4ea63dc6b55c49a5fdbba9ac1fc777fe3.
---
 python/sglang/srt/configs/model_config.py     |  17 +-
 .../npu/quantization/fused_moe_method_npu.py  |  20 +--
 .../npu/quantization/linear_method_npu.py     |  11 +-
 .../compressed_tensors/compressed_tensors.py  |   4 +-
 .../schemes/compressed_tensors_w8a8_int8.py   |  27 ++-
 .../quantization/msmodelslim/msmodelslim.py   | 158 +++++++++---------
 .../msmodelslim/msmodelslim_moe.py            |  63 +++----
 .../msmodelslim/schemes/__init__.py           |   2 +-
 .../schemes/msmodelslim_w4a4_int4.py          |  48 +++---
 .../schemes/msmodelslim_w8a8_int8.py          |  27 +--
 .../ascend/test_ascend_w8a8_quantization.py   |   4 +-
 11 files changed, 206 insertions(+), 175 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 8e7e98a77d0b..a4253b9f59ce 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -16,8 +16,8 @@
 import logging
 import math
 import os
-from enum import Enum, IntEnum, auto
 from pathlib import Path
+from enum import Enum, IntEnum, auto
 from typing import Any, List, Optional, Set, Union
 
 import torch
@@ -26,7 +26,7 @@
 from sglang.srt.environ import envs
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_hip, is_npu, retry
+from sglang.srt.utils import is_hip, retry, is_npu
 from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
@@ -39,7 +39,6 @@
 logger = logging.getLogger(__name__)
 _is_npu = is_npu()
 
-
 class AttentionArch(IntEnum):
     MLA = auto()
     MHA = auto()
@@ -597,12 +596,12 @@ def _parse_quant_hf_config(self):
         return quant_cfg
 
     def _find_quant_modelslim_config(self):
-        quant_config_file = Path(self.model_path, "quant_model_description.json")
-        quant_cfg = None
-        if quant_config_file.is_file():
-            with open(quant_config_file) as f:
+        quant_config_file = Path(self.model_path, "quant_model_description.json")   
+        quant_cfg = None 
+        if quant_config_file.is_file(): 
+            with open(quant_config_file) as f: 
                 quant_cfg = json.load(f)
-
+                            
         return quant_cfg
 
     def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
@@ -725,7 +724,7 @@ def _verify_quantization(self) -> None:
         quant_cfg = self._parse_quant_hf_config()
         if _is_npu:
             quant_cfg = self._find_quant_modelslim_config()
-            self.quantization = "modelslim"
+            self.quantization = 'modelslim'
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get(
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index dfbab790d1ed..54622a1e0873 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -199,7 +199,7 @@ def apply(
             top_k=topk_ids.shape[1],
         )
         return StandardCombineInput(hidden_states=output)
-
+    
     @staticmethod
     def apply_without_routing_weights(
         layer,
@@ -253,11 +253,12 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
     @classmethod
     def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
-        # if cls.is_per_channel_weight:
+        #if cls.is_per_channel_weight:
         if True:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
-            scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()
+            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
+                np.int64)).npu()
             return scale_uint64_tensor, None
         per_group_scale = per_group_scale.transpose(1, 2).contiguous()
         group_num, k, n = weight.shape
@@ -340,7 +341,6 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data)
 
     staticmethod
-
     def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",
@@ -351,11 +351,11 @@ def apply(
         topk_output = dispatch_output.topk_output
 
         topk_weights, topk_ids, _ = topk_output
-        top_k = topk_ids.shape[1]
+        top_k=topk_ids.shape[1]
         group_list_type = 1
         original_shape = hidden_states.shape
         topk_weights = topk_weights
-
+        
         num_tokens = hidden_states.shape[:-1].numel()
 
         first_expert_idx = 0
@@ -372,8 +372,7 @@ def apply(
                 expert_tokens_num_flag=True,
                 active_expert_range=[first_expert_idx, last_expert_idx],
                 quant_mode=1,
-            )
-        )
+            ))
 
         expert_tokens = expert_tokens.to(torch.int64)
 
@@ -383,7 +382,7 @@ def apply(
         w2_scale = [layer.w2_weight_scale]
         # TODO w4a8 scene: dynamic acquisition of dtype in the future
         _output_dtype = torch.bfloat16
-
+        
         hidden_states = torch.ops.npu.npu_grouped_matmul(
             x=[sorted_hidden_states],
             weight=[layer.w13_weight],
@@ -418,8 +417,7 @@ def apply(
         final_hidden_states = torch.ops.npu.npu_moe_token_unpermute(
             permuted_tokens=output,
             sorted_indices=torch.abs(expanded_row_idx),
-            probs=topk_weights,
-        )
+            probs=topk_weights)
         if len(original_shape) == 3:
             final_hidden_states = final_hidden_states.view(original_shape)
 
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 681d45d18f0b..6481b4f79bf4 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -1,8 +1,13 @@
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import torch
 
 from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 from sglang.srt.layers.quantization.base_config import LinearMethodBase
 
 if TYPE_CHECKING:
@@ -20,7 +25,7 @@ def __init__(
 
 
 class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
-
+    
     @staticmethod
     def apply(
         layer: torch.nn.Module,
@@ -134,4 +139,4 @@ def process_weights_after_loading(layer):
         layer.weight_offset.data = layer.weight_offset.data.flatten()
         layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
             layer.weight.data.to(torch.int32)
-        )
+        )
\ No newline at end of file
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 4e10c5d734eb..864bc91cc838 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -32,10 +32,10 @@
     CompressedTensorsScheme,
     CompressedTensorsW4A4Fp4,
     CompressedTensorsW8A8Fp8,
-    CompressedTensorsW8A16Fp8,
-    CompressedTensorsWNA16,
     GPUCompressedTensorsW8A8Int8,
     NPUCompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import (
     find_matched_target,
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 64401aea6a71..fb12922df3be 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -7,10 +7,6 @@
 from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
-    NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod,
-)
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,
     ModelWeightParameter,
@@ -19,6 +15,10 @@
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
 )
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPUW8A8Int8DynamicLinearMethod,
+    NPUW8A8Int8LinearMethod
+)
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
 from sglang.srt.layers.quantization.utils import requantize_with_max_scale
 from sglang.srt.utils import is_cuda
@@ -94,11 +94,15 @@ def create_weights(
 
 
 class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
-
+    
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(strategy, is_static_input_scheme, input_symmetric)
+        super.__init__(
+            strategy,
+            is_static_input_scheme,
+            input_symmetric
+        )
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -186,11 +190,15 @@ def apply_weights(
 
 
 class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
-
+    
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(strategy, is_static_input_scheme, input_symmetric)
+        super.__init__(
+            strategy,
+            is_static_input_scheme,
+            input_symmetric
+        )
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -201,9 +209,10 @@ def process_weights_after_loading(self, layer):
             return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
         else:
             return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
-
+    
     def apply_weights(self, layer, x, bias):
         if self.is_static_input_scheme:
             return NPUW8A8Int8LinearMethod.apply(layer)
         else:
             return NPUW8A8Int8DynamicLinearMethod.apply(layer)
+
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 7825b3fd2027..4de4d04ec6b1 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -5,36 +5,42 @@
 from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
+from pydantic import BaseModel
 
 # from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
 #     NPUW4A8Int4DynamicMoEMethod,
 #     NPUW4A16Int4DynamicMoEMethod,
 #     NPUW8A8Int8DynamicMoEMethod,
 # )
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (  # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod,
-    _NPULinearMethodBase,
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    _NPULinearMethodBase
+    # NPUW8A8Int8DynamicLinearMethod,
+    # NPUW8A8Int8LinearMethod,
 )
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import (
     ModelSlimMoEMethod,
 )
 from sglang.srt.layers.quantization.msmodelslim.schemes import (
     ModelSlimScheme,
-    ModelSlimW4A4Int4,
     ModelSlimW8A8Int8,
+    ModelSlimW4A4Int4,
 )
-
-# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+from sglang.srt.layers.quantization.compressed_tensors.utils import (
+    find_matched_target,
+    is_activation_quantization_format,
+    should_ignore_layer
+)
+#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.utils import apply_module_patch
 
 logger = logging.getLogger(__name__)
 
-
 # func refers to RMSNorm.__init__
 def npu_wrapper_rmsnorm_init(func):
     def init(self, hidden_size: int, **extra_args) -> None:
@@ -45,7 +51,6 @@ def init(self, hidden_size: int, **extra_args) -> None:
 
     return init
 
-
 # func refers to RMSNorm.forward_oot
 def npu_wrapper_rmsnorm_forward(func):
     def _rmsnorm_forward_oot(
@@ -117,7 +122,7 @@ def __init__(self, quant_config: Dict[str, Any] = {}):
                     "forward_npu",
                     [npu_wrapper_rmsnorm_forward],
                 )
-
+    
     def get_linear_method(self) -> ModelSlimLinearMethod:
         return ModelSlimLinearMethod(self)
 
@@ -178,28 +183,29 @@ def get_quant_method(
                 return UnquantizedLinearMethod()
             scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config)
             if scheme is None:
-                raise NotImplementedError(
-                    "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes."
-                )
+                raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.")
             layer.scheme = scheme
-            return ModelSlimLinearMethod(self)
+            return (
+                ModelSlimLinearMethod(self)
+            )
         elif isinstance(layer, FusedMoE):
             return ModelSlimMoEMethod.get_moe_method(self, layer, prefix)
         return None
 
     def _get_scheme_from_parts(
-        self,
-        layer_name: str,
-    ) -> ModelSlimScheme:
+            self, layer_name: str,
+        ) -> ModelSlimScheme:
 
-        quant_type = self.quant_description[layer_name + ".weight"]
+        quant_type = self.quant_description[layer_name + '.weight']
         if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8":
             return ModelSlimW8A8Int8(
-                quant_config=self.quant_description, prefix=layer_name
+                quant_config=self.quant_description,
+                prefix=layer_name
             )
         elif quant_type == "W4A4_DYNAMIC":
             return ModelSlimW4A4Int4(
-                quant_config=self.quant_description, prefix=layer_name
+                quant_config=self.quant_description,
+                prefix=layer_name
             )
 
             # Detect If Mixed Precision
@@ -219,66 +225,66 @@ def _get_scheme_from_parts(
             #             "Other method (CompressedTensorsW4A16Sparse24) is not supported now"
             #         )
 
-            # if is_activation_quantization_format(self.quant_format):
-            # if self._is_fp8_w8a8(weight_quant, input_quant):
-            #     is_fp8_w8a8_supported = self._check_scheme_supported(
-            #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
-            #     )
-            #     if is_fp8_w8a8_supported:
-            #         return CompressedTensorsW8A8Fp8(
-            #             strategy=weight_quant.strategy,
-            #             is_static_input_scheme=(
-            #                 input_quant and not input_quant.dynamic
-            #             ),
-            #         )
-            #     else:
-            #         # note: input_quant will be present for converted models;
-            #         # will be ignored during inference post loading
-            #         return CompressedTensorsW8A16Fp8(
-            #             strategy=weight_quant.strategy,
-            #             is_static_input_scheme=not input_quant.dynamic,
-            #         )
-
-            # # note: input_quant can be None
-            # if self._is_fp8_w8a16(weight_quant, input_quant):
-            #     is_static_input_scheme = input_quant and not input_quant.dynamic
-            #     return CompressedTensorsW8A16Fp8(
-            #         strategy=weight_quant.strategy,
-            #         is_static_input_scheme=is_static_input_scheme,
-            #     )
-
-            # raise NotImplementedError("No msmodelslim compatible scheme was found.")
-
+            #if is_activation_quantization_format(self.quant_format):
+                # if self._is_fp8_w8a8(weight_quant, input_quant):
+                #     is_fp8_w8a8_supported = self._check_scheme_supported(
+                #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
+                #     )
+                #     if is_fp8_w8a8_supported:
+                #         return CompressedTensorsW8A8Fp8(
+                #             strategy=weight_quant.strategy,
+                #             is_static_input_scheme=(
+                #                 input_quant and not input_quant.dynamic
+                #             ),
+                #         )
+                #     else:
+                #         # note: input_quant will be present for converted models;
+                #         # will be ignored during inference post loading
+                #         return CompressedTensorsW8A16Fp8(
+                #             strategy=weight_quant.strategy,
+                #             is_static_input_scheme=not input_quant.dynamic,
+                #         )
+
+                # # note: input_quant can be None
+                # if self._is_fp8_w8a16(weight_quant, input_quant):
+                #     is_static_input_scheme = input_quant and not input_quant.dynamic
+                #     return CompressedTensorsW8A16Fp8(
+                #         strategy=weight_quant.strategy,
+                #         is_static_input_scheme=is_static_input_scheme,
+                #     )
+
+            #raise NotImplementedError("No msmodelslim compatible scheme was found.")
+    
     def get_scheme(
-        self, layer: torch.nn.Module, layer_name: Optional[str] = None
-    ) -> Optional[ModelSlimScheme]:
-        """
-        get_scheme method adjusted for modelslim, taken from
-        python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
-        """
-        # if self.target_scheme_map:
-        #     matched_target = find_matched_target(
-        #         layer_name=layer_name,
-        #         module=layer,
-        #         targets=self.target_scheme_map.keys(),
-        #         fused_mapping=self.packed_modules_mapping,
-        #     )
+            self, layer: torch.nn.Module, layer_name: Optional[str] = None
+        ) -> Optional[ModelSlimScheme]:
+            """
+            get_scheme method adjusted for modelslim, taken from
+            python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+            """
+            # if self.target_scheme_map:
+            #     matched_target = find_matched_target(
+            #         layer_name=layer_name,
+            #         module=layer,
+            #         targets=self.target_scheme_map.keys(),
+            #         fused_mapping=self.packed_modules_mapping,
+            #     )
 
-        #     scheme_dict = self.target_scheme_map[matched_target]
-        #     weight_quant = scheme_dict.get("weights")
-        #     input_quant = scheme_dict.get("input_activations")
-        # else:
-        # Find the quant_scheme
-        scheme = self._get_scheme_from_parts(  # type: ignore
-            # weight_quant=weight_quant,
-            # input_quant=input_quant,
-            layer_name=layer_name,
-        )
+            #     scheme_dict = self.target_scheme_map[matched_target]
+            #     weight_quant = scheme_dict.get("weights")
+            #     input_quant = scheme_dict.get("input_activations")
+            # else:
+                # Find the quant_scheme
+            scheme = self._get_scheme_from_parts(  # type: ignore
+                # weight_quant=weight_quant,
+                # input_quant=input_quant,
+                layer_name=layer_name,
+            )
 
-        # Ascend doesn't support device capability
-        # self._check_scheme_supported(scheme.get_min_capability())
-        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
-        return scheme
+            # Ascend doesn't support device capability
+            # self._check_scheme_supported(scheme.get_min_capability())
+            logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
+            return scheme
 
     def is_layer_skipped(
         self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 4b7b596c4f8d..fe83dc771117 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -2,16 +2,25 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import enum
 import logging
-from typing import TYPE_CHECKING, Any, Dict
+from enum import Enum
+from typing import Callable, Optional, TYPE_CHECKING
+from typing import Any, Dict, List
 
 import torch
 
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+from sglang.srt.layers.quantization.msmodelslim.schemes import (
+    ModelSlimScheme,
+)
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
     NPUW4A8Int8DynamicMoEMethod,
     NPUW8A8Int8DynamicMoEMethod,
 )
-from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+
 from sglang.srt.utils import set_weight_attrs
 
 if TYPE_CHECKING:
@@ -20,7 +29,9 @@
         CombineInput,
         StandardDispatchOutput,
     )
-    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig
+    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import (
+        ModelSlimConfig,
+    )
 
 logger = logging.getLogger(__name__)
 
@@ -62,9 +73,7 @@ def get_moe_method(
 class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod):
 
     def __init__(
-        self,
-        quant_config: Dict[str, Any],
-        prefix: str = None,
+            self, quant_config: Dict[str, Any], prefix: str = None,
     ):
         self.quant_config = quant_config
         self.group_size = 0
@@ -164,9 +173,7 @@ def create_weights(
                 ),
                 requires_grad=False,
             )
-            layer.register_parameter(
-                "w13_weight_offset_second", w13_weight_offset_second
-            )
+            layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second)
             set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
 
             w2_weight_scale_second = torch.nn.Parameter(
@@ -218,13 +225,14 @@ def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
     ):
         self.moe_runner_config = moe_runner_config
-
+    
     def apply(
         self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
         return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+    
 
     def apply_without_routing_weights(
         self,
@@ -235,22 +243,18 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(
-            layer,
-            hidden_states,
-            hidden_states_scale,
-            group_list_type,
-            group_list,
-            output_dtype,
-        )
+        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
+                                                                        hidden_states,
+                                                                        hidden_states_scale,
+                                                                        group_list_type,
+                                                                        group_list,
+                                                                        output_dtype,)
 
 
 class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod):
 
     def __init__(
-        self,
-        quant_config: Dict[str, Any],
-        prefix: str = None,
+            self, quant_config: Dict[str, Any], prefix: str = None,
     ):
         self.quant_config = quant_config
 
@@ -331,13 +335,14 @@ def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
     ):
         self.moe_runner_config = moe_runner_config
-
+    
     def apply(
         self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
         return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+    
 
     def apply_without_routing_weights(
         self,
@@ -348,11 +353,9 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(
-            layer,
-            hidden_states,
-            hidden_states_scale,
-            group_list_type,
-            group_list,
-            output_dtype,
-        )
+        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
+                                                                        hidden_states,
+                                                                        hidden_states_scale,
+                                                                        group_list_type,
+                                                                        group_list,
+                                                                        output_dtype,)
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
index fba516eed7c0..997892772977 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .msmodelslim_scheme import ModelSlimScheme
-from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4
 from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8
+from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4
 
 __all__ = [
     "ModelSlimScheme",
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
index 1b578837c8d4..3bbbf4af1f2d 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
@@ -1,28 +1,40 @@
 # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any, Dict, List, Optional
+from typing import Callable, Optional
 
 import torch
+from torch.nn import Parameter
 
+from typing import Any, Dict, List
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     NPU_W4A4DynamicLinearMethod,
 )
-from sglang.srt.layers.parameter import PerTensorScaleParameter
-from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.msmodelslim.schemes import (
+    ModelSlimScheme,
+)
+
 from sglang.srt.utils import set_weight_attrs
 
 
 class ModelSlimW4A4Int4(ModelSlimScheme):
 
     def __init__(
-        self,
-        quant_config: Dict[str, any],
-        prefix: str,
+        self, quant_config: Dict[str, any], prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC"
-
+        self.is_dynamic = (
+            self.quant_config[prefix + ".weight"]
+            == "W4A4_DYNAMIC"
+        )
+        
     @staticmethod
     def get_weight(
         input_size: int, output_size: int, params_dtype: torch.dtype
@@ -52,12 +64,8 @@ def create_weights(
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
-
-        weight_dict = {
-            "weight": torch.empty(
-                output_size_per_partition, input_size_per_partition, dtype=torch.int8
-            )
-        }
+        
+        weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)}
         for weight_name, weight_param in weight_dict.items():
             param = torch.nn.Parameter(weight_param, requires_grad=False)
             set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
@@ -72,14 +80,10 @@ def create_weights(
             # disable warning
             param.ignore_warning = True
             layer.register_parameter(pertensor_name, param)
-
+        
         perchannel_dict = {}
-        perchannel_dict["weight_scale"] = torch.empty(
-            output_size_per_partition, 1, dtype=params_dtype
-        )
-        perchannel_dict["weight_offset"] = torch.empty(
-            output_size_per_partition, 1, dtype=params_dtype
-        )
+        perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
+        perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
         for perchannel_name, perchannel_param in perchannel_dict.items():
             param = torch.nn.Parameter(perchannel_param, requires_grad=False)
             set_weight_attrs(param, {"output_dim": 0})
@@ -88,7 +92,7 @@ def create_weights(
 
     def process_weights_after_loading(self, layer):
         NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer)
-
+    
     def apply_weights(
         self,
         layer: torch.nn.Module,
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index de99c9fed0b7..b33764b858a9 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -1,31 +1,38 @@
 # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Dict, List, Optional
+from typing import Callable, Optional
 
 import torch
+from torch.nn import Parameter
 
+from typing import Any, Dict, List
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod,
+    NPUW8A8Int8LinearMethod
 )
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
+from sglang.srt.layers.quantization.msmodelslim.schemes import (
+    ModelSlimScheme,
+)
 
 
 class ModelSlimW8A8Int8(ModelSlimScheme):
 
     def __init__(
-        self,
-        quant_config: Dict[str, any],
-        prefix: str,
+        self, quant_config: Dict[str, any], prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC"
+        self.is_dynamic = (
+            self.quant_config[prefix + ".weight"]
+            == "W8A8_DYNAMIC"
+        )
 
     def create_weights(
         self,
@@ -63,7 +70,7 @@ def create_weights(
             weight_loader=weight_loader,
         )
         layer.register_parameter("weight_offset", weight_offset)
-
+        
         if not self.is_dynamic:
             input_scale = PerTensorScaleParameter(
                 data=torch.empty(1, dtype=params_dtype),
@@ -104,7 +111,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module):
             NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
         else:
             NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
-
+    
     def apply_weights(
         self,
         layer: torch.nn.Module,
@@ -114,4 +121,4 @@ def apply_weights(
         if self.is_dynamic:
             return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias)
         else:
-            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
+            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
\ No newline at end of file
diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py
index 959bf88a513f..5c1bc3b66562 100644
--- a/test/manual/ascend/test_ascend_w8a8_quantization.py
+++ b/test/manual/ascend/test_ascend_w8a8_quantization.py
@@ -98,12 +98,12 @@ def test_throughput(self):
 
         if is_in_ci():
             self.assertGreaterEqual(throughput, 25)
-
+            
 
 class TestAscendW8A8CompressedTensors(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        # TODO: Move model to CI or Modelscope
+        #TODO: Move model to CI or Modelscope
         cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(

From df01a40b54829751c2e71a619989f28dd8b05fa2 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Wed, 17 Dec 2025 13:30:34 +0300
Subject: [PATCH 036/175] Pre-commit fixes

---
 python/sglang/srt/configs/model_config.py     |  17 +-
 .../npu/quantization/fused_moe_method_npu.py  |  20 ++-
 .../npu/quantization/linear_method_npu.py     |  11 +-
 .../compressed_tensors/compressed_tensors.py  |   4 +-
 .../schemes/compressed_tensors_w8a8_int8.py   |  27 +--
 .../quantization/msmodelslim/msmodelslim.py   | 158 +++++++++---------
 .../msmodelslim/msmodelslim_moe.py            |  63 ++++---
 .../msmodelslim/schemes/__init__.py           |   2 +-
 .../schemes/msmodelslim_w4a4_int4.py          |  48 +++---
 .../schemes/msmodelslim_w8a8_int8.py          |  27 ++-
 .../ascend/test_ascend_w8a8_quantization.py   |   4 +-
 11 files changed, 175 insertions(+), 206 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index a4253b9f59ce..8e7e98a77d0b 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -16,8 +16,8 @@
 import logging
 import math
 import os
-from pathlib import Path
 from enum import Enum, IntEnum, auto
+from pathlib import Path
 from typing import Any, List, Optional, Set, Union
 
 import torch
@@ -26,7 +26,7 @@
 from sglang.srt.environ import envs
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_hip, retry, is_npu
+from sglang.srt.utils import is_hip, is_npu, retry
 from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
@@ -39,6 +39,7 @@
 logger = logging.getLogger(__name__)
 _is_npu = is_npu()
 
+
 class AttentionArch(IntEnum):
     MLA = auto()
     MHA = auto()
@@ -596,12 +597,12 @@ def _parse_quant_hf_config(self):
         return quant_cfg
 
     def _find_quant_modelslim_config(self):
-        quant_config_file = Path(self.model_path, "quant_model_description.json")   
-        quant_cfg = None 
-        if quant_config_file.is_file(): 
-            with open(quant_config_file) as f: 
+        quant_config_file = Path(self.model_path, "quant_model_description.json")
+        quant_cfg = None
+        if quant_config_file.is_file():
+            with open(quant_config_file) as f:
                 quant_cfg = json.load(f)
-                            
+
         return quant_cfg
 
     def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
@@ -724,7 +725,7 @@ def _verify_quantization(self) -> None:
         quant_cfg = self._parse_quant_hf_config()
         if _is_npu:
             quant_cfg = self._find_quant_modelslim_config()
-            self.quantization = 'modelslim'
+            self.quantization = "modelslim"
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get(
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 54622a1e0873..dfbab790d1ed 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -199,7 +199,7 @@ def apply(
             top_k=topk_ids.shape[1],
         )
         return StandardCombineInput(hidden_states=output)
-    
+
     @staticmethod
     def apply_without_routing_weights(
         layer,
@@ -253,12 +253,11 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
     @classmethod
     def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
-        #if cls.is_per_channel_weight:
+        # if cls.is_per_channel_weight:
         if True:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
-            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
-                np.int64)).npu()
+            scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()
             return scale_uint64_tensor, None
         per_group_scale = per_group_scale.transpose(1, 2).contiguous()
         group_num, k, n = weight.shape
@@ -341,6 +340,7 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data)
 
     staticmethod
+
     def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",
@@ -351,11 +351,11 @@ def apply(
         topk_output = dispatch_output.topk_output
 
         topk_weights, topk_ids, _ = topk_output
-        top_k=topk_ids.shape[1]
+        top_k = topk_ids.shape[1]
         group_list_type = 1
         original_shape = hidden_states.shape
         topk_weights = topk_weights
-        
+
         num_tokens = hidden_states.shape[:-1].numel()
 
         first_expert_idx = 0
@@ -372,7 +372,8 @@ def apply(
                 expert_tokens_num_flag=True,
                 active_expert_range=[first_expert_idx, last_expert_idx],
                 quant_mode=1,
-            ))
+            )
+        )
 
         expert_tokens = expert_tokens.to(torch.int64)
 
@@ -382,7 +383,7 @@ def apply(
         w2_scale = [layer.w2_weight_scale]
         # TODO w4a8 scene: dynamic acquisition of dtype in the future
         _output_dtype = torch.bfloat16
-        
+
         hidden_states = torch.ops.npu.npu_grouped_matmul(
             x=[sorted_hidden_states],
             weight=[layer.w13_weight],
@@ -417,7 +418,8 @@ def apply(
         final_hidden_states = torch.ops.npu.npu_moe_token_unpermute(
             permuted_tokens=output,
             sorted_indices=torch.abs(expanded_row_idx),
-            probs=topk_weights)
+            probs=topk_weights,
+        )
         if len(original_shape) == 3:
             final_hidden_states = final_hidden_states.view(original_shape)
 
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 6481b4f79bf4..681d45d18f0b 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -1,13 +1,8 @@
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
 from sglang.srt.hardware_backend.npu.utils import npu_format_cast
-from sglang.srt.layers.parameter import (
-    ChannelQuantScaleParameter,
-    ModelWeightParameter,
-    PerTensorScaleParameter,
-)
 from sglang.srt.layers.quantization.base_config import LinearMethodBase
 
 if TYPE_CHECKING:
@@ -25,7 +20,7 @@ def __init__(
 
 
 class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
-    
+
     @staticmethod
     def apply(
         layer: torch.nn.Module,
@@ -139,4 +134,4 @@ def process_weights_after_loading(layer):
         layer.weight_offset.data = layer.weight_offset.data.flatten()
         layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
             layer.weight.data.to(torch.int32)
-        )
\ No newline at end of file
+        )
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 864bc91cc838..4e10c5d734eb 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -32,10 +32,10 @@
     CompressedTensorsScheme,
     CompressedTensorsW4A4Fp4,
     CompressedTensorsW8A8Fp8,
-    GPUCompressedTensorsW8A8Int8,
-    NPUCompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16,
+    GPUCompressedTensorsW8A8Int8,
+    NPUCompressedTensorsW8A8Int8,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import (
     find_matched_target,
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index fb12922df3be..64401aea6a71 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -7,6 +7,10 @@
 from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPUW8A8Int8DynamicLinearMethod,
+    NPUW8A8Int8LinearMethod,
+)
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,
     ModelWeightParameter,
@@ -15,10 +19,6 @@
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
 )
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
-    NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod
-)
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
 from sglang.srt.layers.quantization.utils import requantize_with_max_scale
 from sglang.srt.utils import is_cuda
@@ -94,15 +94,11 @@ def create_weights(
 
 
 class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
-    
+
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(
-            strategy,
-            is_static_input_scheme,
-            input_symmetric
-        )
+        super.__init__(strategy, is_static_input_scheme, input_symmetric)
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -190,15 +186,11 @@ def apply_weights(
 
 
 class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
-    
+
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(
-            strategy,
-            is_static_input_scheme,
-            input_symmetric
-        )
+        super.__init__(strategy, is_static_input_scheme, input_symmetric)
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -209,10 +201,9 @@ def process_weights_after_loading(self, layer):
             return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
         else:
             return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
-    
+
     def apply_weights(self, layer, x, bias):
         if self.is_static_input_scheme:
             return NPUW8A8Int8LinearMethod.apply(layer)
         else:
             return NPUW8A8Int8DynamicLinearMethod.apply(layer)
-
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 4de4d04ec6b1..7825b3fd2027 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -5,42 +5,36 @@
 from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
-from pydantic import BaseModel
 
 # from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
 #     NPUW4A8Int4DynamicMoEMethod,
 #     NPUW4A16Int4DynamicMoEMethod,
 #     NPUW8A8Int8DynamicMoEMethod,
 # )
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
-    _NPULinearMethodBase
-    # NPUW8A8Int8DynamicLinearMethod,
-    # NPUW8A8Int8LinearMethod,
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (  # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod,
+    _NPULinearMethodBase,
 )
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import (
     ModelSlimMoEMethod,
 )
 from sglang.srt.layers.quantization.msmodelslim.schemes import (
     ModelSlimScheme,
-    ModelSlimW8A8Int8,
     ModelSlimW4A4Int4,
+    ModelSlimW8A8Int8,
 )
-from sglang.srt.layers.quantization.compressed_tensors.utils import (
-    find_matched_target,
-    is_activation_quantization_format,
-    should_ignore_layer
-)
-#from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+
+# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.utils import apply_module_patch
 
 logger = logging.getLogger(__name__)
 
+
 # func refers to RMSNorm.__init__
 def npu_wrapper_rmsnorm_init(func):
     def init(self, hidden_size: int, **extra_args) -> None:
@@ -51,6 +45,7 @@ def init(self, hidden_size: int, **extra_args) -> None:
 
     return init
 
+
 # func refers to RMSNorm.forward_oot
 def npu_wrapper_rmsnorm_forward(func):
     def _rmsnorm_forward_oot(
@@ -122,7 +117,7 @@ def __init__(self, quant_config: Dict[str, Any] = {}):
                     "forward_npu",
                     [npu_wrapper_rmsnorm_forward],
                 )
-    
+
     def get_linear_method(self) -> ModelSlimLinearMethod:
         return ModelSlimLinearMethod(self)
 
@@ -183,29 +178,28 @@ def get_quant_method(
                 return UnquantizedLinearMethod()
             scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config)
             if scheme is None:
-                raise NotImplementedError("At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes.")
+                raise NotImplementedError(
+                    "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes."
+                )
             layer.scheme = scheme
-            return (
-                ModelSlimLinearMethod(self)
-            )
+            return ModelSlimLinearMethod(self)
         elif isinstance(layer, FusedMoE):
             return ModelSlimMoEMethod.get_moe_method(self, layer, prefix)
         return None
 
     def _get_scheme_from_parts(
-            self, layer_name: str,
-        ) -> ModelSlimScheme:
+        self,
+        layer_name: str,
+    ) -> ModelSlimScheme:
 
-        quant_type = self.quant_description[layer_name + '.weight']
+        quant_type = self.quant_description[layer_name + ".weight"]
         if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8":
             return ModelSlimW8A8Int8(
-                quant_config=self.quant_description,
-                prefix=layer_name
+                quant_config=self.quant_description, prefix=layer_name
             )
         elif quant_type == "W4A4_DYNAMIC":
             return ModelSlimW4A4Int4(
-                quant_config=self.quant_description,
-                prefix=layer_name
+                quant_config=self.quant_description, prefix=layer_name
             )
 
             # Detect If Mixed Precision
@@ -225,66 +219,66 @@ def _get_scheme_from_parts(
             #             "Other method (CompressedTensorsW4A16Sparse24) is not supported now"
             #         )
 
-            #if is_activation_quantization_format(self.quant_format):
-                # if self._is_fp8_w8a8(weight_quant, input_quant):
-                #     is_fp8_w8a8_supported = self._check_scheme_supported(
-                #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
-                #     )
-                #     if is_fp8_w8a8_supported:
-                #         return CompressedTensorsW8A8Fp8(
-                #             strategy=weight_quant.strategy,
-                #             is_static_input_scheme=(
-                #                 input_quant and not input_quant.dynamic
-                #             ),
-                #         )
-                #     else:
-                #         # note: input_quant will be present for converted models;
-                #         # will be ignored during inference post loading
-                #         return CompressedTensorsW8A16Fp8(
-                #             strategy=weight_quant.strategy,
-                #             is_static_input_scheme=not input_quant.dynamic,
-                #         )
-
-                # # note: input_quant can be None
-                # if self._is_fp8_w8a16(weight_quant, input_quant):
-                #     is_static_input_scheme = input_quant and not input_quant.dynamic
-                #     return CompressedTensorsW8A16Fp8(
-                #         strategy=weight_quant.strategy,
-                #         is_static_input_scheme=is_static_input_scheme,
-                #     )
-
-            #raise NotImplementedError("No msmodelslim compatible scheme was found.")
-    
-    def get_scheme(
-            self, layer: torch.nn.Module, layer_name: Optional[str] = None
-        ) -> Optional[ModelSlimScheme]:
-            """
-            get_scheme method adjusted for modelslim, taken from
-            python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
-            """
-            # if self.target_scheme_map:
-            #     matched_target = find_matched_target(
-            #         layer_name=layer_name,
-            #         module=layer,
-            #         targets=self.target_scheme_map.keys(),
-            #         fused_mapping=self.packed_modules_mapping,
+            # if is_activation_quantization_format(self.quant_format):
+            # if self._is_fp8_w8a8(weight_quant, input_quant):
+            #     is_fp8_w8a8_supported = self._check_scheme_supported(
+            #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
             #     )
+            #     if is_fp8_w8a8_supported:
+            #         return CompressedTensorsW8A8Fp8(
+            #             strategy=weight_quant.strategy,
+            #             is_static_input_scheme=(
+            #                 input_quant and not input_quant.dynamic
+            #             ),
+            #         )
+            #     else:
+            #         # note: input_quant will be present for converted models;
+            #         # will be ignored during inference post loading
+            #         return CompressedTensorsW8A16Fp8(
+            #             strategy=weight_quant.strategy,
+            #             is_static_input_scheme=not input_quant.dynamic,
+            #         )
 
-            #     scheme_dict = self.target_scheme_map[matched_target]
-            #     weight_quant = scheme_dict.get("weights")
-            #     input_quant = scheme_dict.get("input_activations")
-            # else:
-                # Find the quant_scheme
-            scheme = self._get_scheme_from_parts(  # type: ignore
-                # weight_quant=weight_quant,
-                # input_quant=input_quant,
-                layer_name=layer_name,
-            )
+            # # note: input_quant can be None
+            # if self._is_fp8_w8a16(weight_quant, input_quant):
+            #     is_static_input_scheme = input_quant and not input_quant.dynamic
+            #     return CompressedTensorsW8A16Fp8(
+            #         strategy=weight_quant.strategy,
+            #         is_static_input_scheme=is_static_input_scheme,
+            #     )
+
+            # raise NotImplementedError("No msmodelslim compatible scheme was found.")
+
+    def get_scheme(
+        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+    ) -> Optional[ModelSlimScheme]:
+        """
+        get_scheme method adjusted for modelslim, taken from
+        python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+        """
+        # if self.target_scheme_map:
+        #     matched_target = find_matched_target(
+        #         layer_name=layer_name,
+        #         module=layer,
+        #         targets=self.target_scheme_map.keys(),
+        #         fused_mapping=self.packed_modules_mapping,
+        #     )
+
+        #     scheme_dict = self.target_scheme_map[matched_target]
+        #     weight_quant = scheme_dict.get("weights")
+        #     input_quant = scheme_dict.get("input_activations")
+        # else:
+        # Find the quant_scheme
+        scheme = self._get_scheme_from_parts(  # type: ignore
+            # weight_quant=weight_quant,
+            # input_quant=input_quant,
+            layer_name=layer_name,
+        )
 
-            # Ascend doesn't support device capability
-            # self._check_scheme_supported(scheme.get_min_capability())
-            logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
-            return scheme
+        # Ascend doesn't support device capability
+        # self._check_scheme_supported(scheme.get_min_capability())
+        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
+        return scheme
 
     def is_layer_skipped(
         self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index fe83dc771117..4b7b596c4f8d 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -2,25 +2,16 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import enum
 import logging
-from enum import Enum
-from typing import Callable, Optional, TYPE_CHECKING
-from typing import Any, Dict, List
+from typing import TYPE_CHECKING, Any, Dict
 
 import torch
 
-from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
-from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-from sglang.srt.layers.quantization.msmodelslim.schemes import (
-    ModelSlimScheme,
-)
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
     NPUW4A8Int8DynamicMoEMethod,
     NPUW8A8Int8DynamicMoEMethod,
 )
-
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
 from sglang.srt.utils import set_weight_attrs
 
 if TYPE_CHECKING:
@@ -29,9 +20,7 @@
         CombineInput,
         StandardDispatchOutput,
     )
-    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import (
-        ModelSlimConfig,
-    )
+    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +62,9 @@ def get_moe_method(
 class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod):
 
     def __init__(
-            self, quant_config: Dict[str, Any], prefix: str = None,
+        self,
+        quant_config: Dict[str, Any],
+        prefix: str = None,
     ):
         self.quant_config = quant_config
         self.group_size = 0
@@ -173,7 +164,9 @@ def create_weights(
                 ),
                 requires_grad=False,
             )
-            layer.register_parameter("w13_weight_offset_second", w13_weight_offset_second)
+            layer.register_parameter(
+                "w13_weight_offset_second", w13_weight_offset_second
+            )
             set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
 
             w2_weight_scale_second = torch.nn.Parameter(
@@ -225,14 +218,13 @@ def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
     ):
         self.moe_runner_config = moe_runner_config
-    
+
     def apply(
         self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
         return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
-    
 
     def apply_without_routing_weights(
         self,
@@ -243,18 +235,22 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
-                                                                        hidden_states,
-                                                                        hidden_states_scale,
-                                                                        group_list_type,
-                                                                        group_list,
-                                                                        output_dtype,)
+        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
 
 
 class ModelSlimW8A8Int8MoE(ModelSlimMoEMethod):
 
     def __init__(
-            self, quant_config: Dict[str, Any], prefix: str = None,
+        self,
+        quant_config: Dict[str, Any],
+        prefix: str = None,
     ):
         self.quant_config = quant_config
 
@@ -335,14 +331,13 @@ def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
     ):
         self.moe_runner_config = moe_runner_config
-    
+
     def apply(
         self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
         return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
-    
 
     def apply_without_routing_weights(
         self,
@@ -353,9 +348,11 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(layer,
-                                                                        hidden_states,
-                                                                        hidden_states_scale,
-                                                                        group_list_type,
-                                                                        group_list,
-                                                                        output_dtype,)
+        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
index 997892772977..fba516eed7c0 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .msmodelslim_scheme import ModelSlimScheme
-from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8
 from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4
+from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8
 
 __all__ = [
     "ModelSlimScheme",
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
index 3bbbf4af1f2d..1b578837c8d4 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
@@ -1,40 +1,28 @@
 # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Callable, Optional
+from typing import Any, Dict, List, Optional
 
 import torch
-from torch.nn import Parameter
 
-from typing import Any, Dict, List
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     NPU_W4A4DynamicLinearMethod,
 )
-from sglang.srt.layers.parameter import (
-    ChannelQuantScaleParameter,
-    ModelWeightParameter,
-    PerTensorScaleParameter,
-)
-from sglang.srt.layers.quantization.msmodelslim.schemes import (
-    ModelSlimScheme,
-)
-
+from sglang.srt.layers.parameter import PerTensorScaleParameter
+from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
 from sglang.srt.utils import set_weight_attrs
 
 
 class ModelSlimW4A4Int4(ModelSlimScheme):
 
     def __init__(
-        self, quant_config: Dict[str, any], prefix: str,
+        self,
+        quant_config: Dict[str, any],
+        prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = (
-            self.quant_config[prefix + ".weight"]
-            == "W4A4_DYNAMIC"
-        )
-        
+        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC"
+
     @staticmethod
     def get_weight(
         input_size: int, output_size: int, params_dtype: torch.dtype
@@ -64,8 +52,12 @@ def create_weights(
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
-        
-        weight_dict = {"weight": torch.empty(output_size_per_partition, input_size_per_partition, dtype=torch.int8)}
+
+        weight_dict = {
+            "weight": torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=torch.int8
+            )
+        }
         for weight_name, weight_param in weight_dict.items():
             param = torch.nn.Parameter(weight_param, requires_grad=False)
             set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
@@ -80,10 +72,14 @@ def create_weights(
             # disable warning
             param.ignore_warning = True
             layer.register_parameter(pertensor_name, param)
-        
+
         perchannel_dict = {}
-        perchannel_dict["weight_scale"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
-        perchannel_dict["weight_offset"] = torch.empty(output_size_per_partition, 1, dtype=params_dtype)
+        perchannel_dict["weight_scale"] = torch.empty(
+            output_size_per_partition, 1, dtype=params_dtype
+        )
+        perchannel_dict["weight_offset"] = torch.empty(
+            output_size_per_partition, 1, dtype=params_dtype
+        )
         for perchannel_name, perchannel_param in perchannel_dict.items():
             param = torch.nn.Parameter(perchannel_param, requires_grad=False)
             set_weight_attrs(param, {"output_dim": 0})
@@ -92,7 +88,7 @@ def create_weights(
 
     def process_weights_after_loading(self, layer):
         NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer)
-    
+
     def apply_weights(
         self,
         layer: torch.nn.Module,
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index b33764b858a9..de99c9fed0b7 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -1,38 +1,31 @@
 # Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Callable, Optional
+from typing import Dict, List, Optional
 
 import torch
-from torch.nn import Parameter
 
-from typing import Any, Dict, List
-
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod
+    NPUW8A8Int8LinearMethod,
 )
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from sglang.srt.layers.quantization.msmodelslim.schemes import (
-    ModelSlimScheme,
-)
+from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
 
 
 class ModelSlimW8A8Int8(ModelSlimScheme):
 
     def __init__(
-        self, quant_config: Dict[str, any], prefix: str,
+        self,
+        quant_config: Dict[str, any],
+        prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = (
-            self.quant_config[prefix + ".weight"]
-            == "W8A8_DYNAMIC"
-        )
+        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC"
 
     def create_weights(
         self,
@@ -70,7 +63,7 @@ def create_weights(
             weight_loader=weight_loader,
         )
         layer.register_parameter("weight_offset", weight_offset)
-        
+
         if not self.is_dynamic:
             input_scale = PerTensorScaleParameter(
                 data=torch.empty(1, dtype=params_dtype),
@@ -111,7 +104,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module):
             NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
         else:
             NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
-    
+
     def apply_weights(
         self,
         layer: torch.nn.Module,
@@ -121,4 +114,4 @@ def apply_weights(
         if self.is_dynamic:
             return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias)
         else:
-            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
\ No newline at end of file
+            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py
index 5c1bc3b66562..959bf88a513f 100644
--- a/test/manual/ascend/test_ascend_w8a8_quantization.py
+++ b/test/manual/ascend/test_ascend_w8a8_quantization.py
@@ -98,12 +98,12 @@ def test_throughput(self):
 
         if is_in_ci():
             self.assertGreaterEqual(throughput, 25)
-            
+
 
 class TestAscendW8A8CompressedTensors(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        #TODO: Move model to CI or Modelscope
+        # TODO: Move model to CI or Modelscope
         cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(

From a16b69e21dacbae5d371a14e1ec144ce23d85e03 Mon Sep 17 00:00:00 2001
From: TamirBaydasov <mr.jeijy@gmail.com>
Date: Wed, 17 Dec 2025 14:35:42 +0300
Subject: [PATCH 037/175] Fix model config loading, add NPU w8a8int8 MoE for
 compressed-tensors, fix for w8a8int8 linear schemes

---
 python/sglang/srt/configs/model_config.py     |  19 ++-
 .../npu/quantization/fused_moe_method_npu.py  |  15 ++-
 .../npu/quantization/linear_method_npu.py     |   8 +-
 .../compressed_tensors/compressed_tensors.py  |   4 +-
 .../compressed_tensors_moe.py                 | 127 +++++++++++++++++-
 .../schemes/compressed_tensors_w8a8_int8.py   |  24 ++--
 6 files changed, 170 insertions(+), 27 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 8e7e98a77d0b..1d1f81c87aa8 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -602,6 +602,9 @@ def _find_quant_modelslim_config(self):
         if quant_config_file.is_file():
             with open(quant_config_file) as f:
                 quant_cfg = json.load(f)
+            # This field is required for flagless model loading but is not present in
+            # modelslim model description, so we're adding it here manually.
+            quant_cfg['quant_method'] = 'modelslim'
 
         return quant_cfg
 
@@ -721,11 +724,17 @@ def _verify_quantization(self) -> None:
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
 
-        # Parse quantization method from the HF model config, if available.
-        quant_cfg = self._parse_quant_hf_config()
-        if _is_npu:
-            quant_cfg = self._find_quant_modelslim_config()
-            self.quantization = "modelslim"
+        # Parse quantization method from the HF and ModelSlim model config, if available.
+        # Only one function should return config, other should return None.
+        cfg_list = []
+        cfg_list.append(self._parse_quant_hf_config)
+        cfg_list.append(self._find_quant_modelslim_config)
+
+        # Filter out None values
+        cfg_list = [item for item in cfg_list if item is not None]
+        assert (len(cfg_list) == 1), "Config list contains configs from 2 methods, must be only 1"
+        
+        quant_cfg = cfg_list[0]
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get(
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index dfbab790d1ed..db00f47f90d5 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -165,12 +165,15 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         layer.w2_weight_scale = torch.nn.Parameter(
             layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
         )
-        layer.w13_weight_offset = torch.nn.Parameter(
-            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
-        layer.w2_weight_offset = torch.nn.Parameter(
-            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
-        )
+        # Compressed-tensors format doesn't have this field
+        if hasattr(layer, "w13_weight_offset"):
+            layer.w13_weight_offset = torch.nn.Parameter(
+                layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+            )
+        if hasattr(layer, "w2_weight_offset"):
+            layer.w2_weight_offset = torch.nn.Parameter(
+                layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+            )
 
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 681d45d18f0b..2d70834caf0b 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -59,7 +59,9 @@ def process_weights_after_loading(layer: torch.nn.Module):
         layer.weight.data = npu_format_cast(layer.weight.data)
 
         layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        # Compressed-tensors format doesn't have this field
+        if hasattr(layer, "weight_offset"):
+            layer.weight_offset.data = layer.weight_offset.data.flatten()
 
         expanding_factor = layer.weight.data.shape[0]
         layer.aclnn_input_scale = torch.nn.Parameter(
@@ -101,7 +103,9 @@ def process_weights_after_loading(layer: torch.nn.Module):
         layer.weight.data = npu_format_cast(layer.weight.data)
 
         layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        # Compressed-tensors format doesn't have this field
+        if hasattr(layer, "weight_offset"):
+            layer.weight_offset.data = layer.weight_offset.data.flatten()
 
 
 class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase):
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 4e10c5d734eb..f97854839b2c 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -588,7 +588,9 @@ def get_scheme(
 
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
-        self._check_scheme_supported(scheme.get_min_capability())
+        # Note: NPU devices do not support min_capability function
+        if not _is_npu:
+            self._check_scheme_supported(scheme.get_min_capability())
         logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
         return scheme
 
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index b5e3964c85f4..3ec4a45f43f9 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -15,6 +15,9 @@
 from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
 from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
 from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    NPUW8A8Int8DynamicMoEMethod,
+)
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS,
@@ -32,7 +35,7 @@
     replace_parameter,
     swizzle_blockscale,
 )
-from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip, set_weight_attrs
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, is_hip, set_weight_attrs
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
@@ -45,6 +48,7 @@
     )
 
 _is_hip = is_hip()
+_is_npu = is_npu()
 _is_cuda = is_cuda()
 
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
@@ -67,6 +71,7 @@ class GPTQMarlinState(Enum):
     "CompressedTensorsMoEMethod",
     "CompressedTensorsW4A4Nvfp4MoEMethod",
     "CompressedTensorsW8A8Fp8MoEMethod",
+    "NPUCompressedTensorsW8A8Int8MoEMethod"
     "CompressedTensorsWNA16MoEMethod",
 ]
 
@@ -98,6 +103,12 @@ def get_moe_method(
         elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
             logger.info_once("Using CompressedTensorsW8A8Fp8MoEMethod")
             return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
+        elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
+            if _is_npu:
+                logger.info_once("Using NPUCompressedTensorsW8A8Int8MoEMethod")
+                return NPUCompressedTensorsW8A8Int8MoEMethod(quant_config)
+            else:
+                raise NotImplementedError(f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now.")
         else:
             raise RuntimeError(
                 f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
@@ -681,6 +692,120 @@ def apply(
             return self.runner.run(dispatch_output, quant_info)
 
 
+class NPUCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
+
+    def __init__(self, quant_config: CompressedTensorsConfig):
+        self.quant_config = quant_config
+        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
+        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
+            "input_activations"
+        )
+        if not _is_npu:
+            raise NotImplementedError(
+                "w8a8 int8 compressed tensors moe scheme is supported only for Ascend device for now."
+            )
+        self.static_input_scales = not self.input_quant.dynamic
+        per_channel = (
+            self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+            and self.input_quant.strategy == QuantizationStrategy.TOKEN
+        )
+        if not per_channel:
+            raise ValueError(
+                "For INT8 Fused MoE layers, we require channelwise, "
+                "dynamic per token quantization. Found "
+                f"{self.weight_quant}, {self.input_quant}"
+            )
+
+        self.static_input_scales = not self.input_quant.dynamic
+        if self.static_input_scales:
+            raise ValueError(
+                "For INT8 Fused MoE layers, we require channelwise, "
+                "dynamic per token quantization. Found static input scales."
+            )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        params_dtype = torch.int8
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # INPUT_SCALES
+        assert not self.static_input_scales
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+
+
 class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
 
     def __init__(self, quant_config: CompressedTensorsConfig, num_gpu_experts=-1):
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 64401aea6a71..73c24aec92e5 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -93,12 +93,12 @@ def create_weights(
                 layer.register_parameter("input_zero_point", input_zero_point)
 
 
-class GPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
+class GPUCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8):
 
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(strategy, is_static_input_scheme, input_symmetric)
+        super().__init__(strategy, is_static_input_scheme, input_symmetric)
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -185,25 +185,25 @@ def apply_weights(
         )
 
 
-class NPUCompressedTensorsW8A8Int8(CompressedTensorsScheme):
+class NPUCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8):
 
     def __init__(
         self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
     ):
-        super.__init__(strategy, is_static_input_scheme, input_symmetric)
+        super().__init__(strategy, is_static_input_scheme, input_symmetric)
+        # TODO: Currently, NPU kernel for static quant requires quant_bias field,
+        # which can't be replicated in compressed-tensors.
+        if self.is_static_input_scheme:
+            raise NotImplementedError(
+                "Static compressed-tensors scheme is not yet supported on NPU."
+            )
 
     @classmethod
     def get_min_capability(cls) -> int:
         return NotImplementedError
 
     def process_weights_after_loading(self, layer):
-        if self.is_static_input_scheme:
-            return NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
-        else:
-            return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
+        return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
 
     def apply_weights(self, layer, x, bias):
-        if self.is_static_input_scheme:
-            return NPUW8A8Int8LinearMethod.apply(layer)
-        else:
-            return NPUW8A8Int8DynamicLinearMethod.apply(layer)
+        return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias)

From 238759c5fa873af55af3bd370a69fc8cd94f0a8d Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Wed, 17 Dec 2025 14:45:40 +0300
Subject: [PATCH 038/175] Pre-commit fixes

---
 python/sglang/srt/configs/model_config.py         |  8 +++++---
 .../npu/quantization/fused_moe_method_npu.py      |  6 ++++--
 .../compressed_tensors/compressed_tensors_moe.py  | 15 ++++++++-------
 .../schemes/compressed_tensors_w8a8_int8.py       |  1 -
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 1d1f81c87aa8..5e5eba62a295 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -604,7 +604,7 @@ def _find_quant_modelslim_config(self):
                 quant_cfg = json.load(f)
             # This field is required for flagless model loading but is not present in
             # modelslim model description, so we're adding it here manually.
-            quant_cfg['quant_method'] = 'modelslim'
+            quant_cfg["quant_method"] = "modelslim"
 
         return quant_cfg
 
@@ -732,8 +732,10 @@ def _verify_quantization(self) -> None:
 
         # Filter out None values
         cfg_list = [item for item in cfg_list if item is not None]
-        assert (len(cfg_list) == 1), "Config list contains configs from 2 methods, must be only 1"
-        
+        assert (
+            len(cfg_list) == 1
+        ), "Config list contains configs from 2 methods, must be only 1"
+
         quant_cfg = cfg_list[0]
 
         if quant_cfg is not None:
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index db00f47f90d5..71ab140c1f02 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -168,11 +168,13 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         # Compressed-tensors format doesn't have this field
         if hasattr(layer, "w13_weight_offset"):
             layer.w13_weight_offset = torch.nn.Parameter(
-                layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+                layer.w13_weight_offset.data.squeeze(-1).contiguous(),
+                requires_grad=False,
             )
         if hasattr(layer, "w2_weight_offset"):
             layer.w2_weight_offset = torch.nn.Parameter(
-                layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+                layer.w2_weight_offset.data.squeeze(-1).contiguous(),
+                requires_grad=False,
             )
 
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 3ec4a45f43f9..9e73f5ac10ef 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -12,12 +12,12 @@
 from compressed_tensors.quantization import QuantizationStrategy
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
-from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
-from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
-from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
 from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
     NPUW8A8Int8DynamicMoEMethod,
 )
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS,
@@ -35,7 +35,7 @@
     replace_parameter,
     swizzle_blockscale,
 )
-from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, is_hip, set_weight_attrs
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip, is_npu, set_weight_attrs
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
@@ -71,8 +71,7 @@ class GPTQMarlinState(Enum):
     "CompressedTensorsMoEMethod",
     "CompressedTensorsW4A4Nvfp4MoEMethod",
     "CompressedTensorsW8A8Fp8MoEMethod",
-    "NPUCompressedTensorsW8A8Int8MoEMethod"
-    "CompressedTensorsWNA16MoEMethod",
+    "NPUCompressedTensorsW8A8Int8MoEMethod" "CompressedTensorsWNA16MoEMethod",
 ]
 
 
@@ -108,7 +107,9 @@ def get_moe_method(
                 logger.info_once("Using NPUCompressedTensorsW8A8Int8MoEMethod")
                 return NPUCompressedTensorsW8A8Int8MoEMethod(quant_config)
             else:
-                raise NotImplementedError(f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now.")
+                raise NotImplementedError(
+                    f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
+                )
         else:
             raise RuntimeError(
                 f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 73c24aec92e5..d307f6b01c33 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -9,7 +9,6 @@
 
 from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     NPUW8A8Int8DynamicLinearMethod,
-    NPUW8A8Int8LinearMethod,
 )
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,

From 5ca19cb80e56f50861903694674cf68e399141cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:20:54 +0300
Subject: [PATCH 039/175] Delete comments

---
 .../quantization/msmodelslim/msmodelslim.py   | 115 ------------------
 1 file changed, 115 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 7825b3fd2027..111fda21c02e 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -202,52 +202,6 @@ def _get_scheme_from_parts(
                 quant_config=self.quant_description, prefix=layer_name
             )
 
-            # Detect If Mixed Precision
-            # if self._is_wNa16_group_channel(weight_quant, input_quant):
-            #     if (
-            #         self.quant_format == CompressionFormat.pack_quantized.value
-            #         and weight_quant.num_bits in WNA16_SUPPORTED_BITS
-            #     ):
-            #         return CompressedTensorsWNA16(
-            #             num_bits=weight_quant.num_bits,
-            #             strategy=weight_quant.strategy,
-            #             group_size=weight_quant.group_size,
-            #             actorder=weight_quant.actorder,
-            #         )
-            #     else:
-            #         raise ImportError(
-            #             "Other method (CompressedTensorsW4A16Sparse24) is not supported now"
-            #         )
-
-            # if is_activation_quantization_format(self.quant_format):
-            # if self._is_fp8_w8a8(weight_quant, input_quant):
-            #     is_fp8_w8a8_supported = self._check_scheme_supported(
-            #         CompressedTensorsW8A8Fp8.get_min_capability(), error=False
-            #     )
-            #     if is_fp8_w8a8_supported:
-            #         return CompressedTensorsW8A8Fp8(
-            #             strategy=weight_quant.strategy,
-            #             is_static_input_scheme=(
-            #                 input_quant and not input_quant.dynamic
-            #             ),
-            #         )
-            #     else:
-            #         # note: input_quant will be present for converted models;
-            #         # will be ignored during inference post loading
-            #         return CompressedTensorsW8A16Fp8(
-            #             strategy=weight_quant.strategy,
-            #             is_static_input_scheme=not input_quant.dynamic,
-            #         )
-
-            # # note: input_quant can be None
-            # if self._is_fp8_w8a16(weight_quant, input_quant):
-            #     is_static_input_scheme = input_quant and not input_quant.dynamic
-            #     return CompressedTensorsW8A16Fp8(
-            #         strategy=weight_quant.strategy,
-            #         is_static_input_scheme=is_static_input_scheme,
-            #     )
-
-            # raise NotImplementedError("No msmodelslim compatible scheme was found.")
 
     def get_scheme(
         self, layer: torch.nn.Module, layer_name: Optional[str] = None
@@ -256,19 +210,6 @@ def get_scheme(
         get_scheme method adjusted for modelslim, taken from
         python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
         """
-        # if self.target_scheme_map:
-        #     matched_target = find_matched_target(
-        #         layer_name=layer_name,
-        #         module=layer,
-        #         targets=self.target_scheme_map.keys(),
-        #         fused_mapping=self.packed_modules_mapping,
-        #     )
-
-        #     scheme_dict = self.target_scheme_map[matched_target]
-        #     weight_quant = scheme_dict.get("weights")
-        #     input_quant = scheme_dict.get("input_activations")
-        # else:
-        # Find the quant_scheme
         scheme = self._get_scheme_from_parts(  # type: ignore
             # weight_quant=weight_quant,
             # input_quant=input_quant,
@@ -314,62 +255,6 @@ def is_layer_skipped(
     def get_scaled_act_names(self) -> List[str]:
         return []
 
-    # def is_dynamic_token_w4(self, weight_quant, input_quant) -> bool:
-    #     is_w4 = weight_quant.num_bits == 4
-    #     weight_strategy = (
-    #         weight_quant.strategy == QuantizationStrategy.TENSOR.value
-    #         or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-    #         or weight_quant.strategy == QuantizationStrategy.GROUP.value
-    #     )
-    #     if input_quant is not None:
-    #         is_token = (
-    #             weight_strategy
-    #             and input_quant.strategy == QuantizationStrategy.TOKEN.value
-    #         )
-    #         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
-    #     else:
-    #         is_token = weight_strategy
-    #         is_dynamic = not weight_quant.dynamic
-
-    #     # Both symmetric and asymmetric input quantization supported.
-    #     # Only symmetric weight quantization supported.
-    #     return is_w4 and weight_quant.symmetric and is_token and is_dynamic
-
-    # def _is_static_tensor_w8a8(
-    #     self, weight_quant: BaseModel, input_quant: BaseModel
-    # ) -> bool:
-    #     is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-    #     weight_strategy = (
-    #         weight_quant.strategy == QuantizationStrategy.TENSOR.value
-    #         or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-    #     )
-    #     is_tensor = (
-    #         weight_strategy
-    #         and input_quant.strategy == QuantizationStrategy.TENSOR.value
-    #     )
-    #     is_static = not weight_quant.dynamic and not input_quant.dynamic
-
-    #     # Both symmetric and asymmetric input quantization supported.
-    #     # Only symmetric weight quantization supported.
-    #     return is_8_bits and is_tensor and weight_quant.symmetric and is_static
-
-    # def _is_dynamic_token_w8a8(
-    #     self, weight_quant: BaseModel, input_quant: BaseModel
-    # ) -> bool:
-    #     is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-    #     weight_strategy = (
-    #         weight_quant.strategy == QuantizationStrategy.TENSOR.value
-    #         or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-    #     )
-    #     is_token = (
-    #         weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
-    #     )
-    #     is_dynamic = not weight_quant.dynamic and input_quant.dynamic
-
-    #     # Both symmetric and asymmetric input quantization supported.
-    #     # Only symmetric weight quantization supported.
-    #     return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
-
 
 class ModelSlimLinearMethod(_NPULinearMethodBase):
 

From 1f18881992df3496987194507cb3f0af3fefd5b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:22:05 +0300
Subject: [PATCH 040/175] Delete comments

---
 .../quantization/msmodelslim/msmodelslim.py   | 31 ++-----------------
 1 file changed, 2 insertions(+), 29 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 111fda21c02e..78e5b2d66ce8 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -6,12 +6,7 @@
 
 import torch
 
-# from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
-#     NPUW4A8Int4DynamicMoEMethod,
-#     NPUW4A16Int4DynamicMoEMethod,
-#     NPUW8A8Int8DynamicMoEMethod,
-# )
-from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (  # NPUW8A8Int8DynamicLinearMethod,; NPUW8A8Int8LinearMethod,
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
     _NPULinearMethodBase,
 )
 from sglang.srt.layers.quantization.base_config import (
@@ -28,7 +23,6 @@
     ModelSlimW8A8Int8,
 )
 
-# from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.utils import apply_module_patch
 
@@ -90,20 +84,6 @@ def __init__(self, quant_config: Dict[str, Any] = {}):
         self.packed_modules_mapping = (
             packed_modules_mapping if packed_modules_mapping is not None else {}
         )
-        # self.target_scheme_map = (
-        #     CompressedTensorsConfig._quantization_scheme_map_from_config(
-        #         config=quant_config
-        #     )
-        # )
-        # target = "MoEGMM" if "MoEGMM" in self.target_scheme_map else "Linear"
-        # target_scheme = self.target_scheme_map.get(target, None)
-        # if target_scheme is None:
-        #     self.is_moe_w4_dynamic = False
-        # else:
-        #     weight_quant = target_scheme.get("weights")
-        #     input_quant = target_scheme.get("input_activations")
-        #     self.is_moe_w4_dynamic = self.is_dynamic_token_w4(weight_quant, input_quant)
-        #     self.is_moe_input_quant = input_quant
 
         for name in self.quant_description.keys():
             if "norm.bias" in name:
@@ -169,10 +149,6 @@ def get_quant_method(
                 prefix_in_quant_config = prefix.replace(
                     proj_name, packed_modules_mapping_subset[proj_name][0]
                 )
-            # self.is_dynamic = (
-            #     self.quant_description[prefix_in_quant_config + ".weight"]
-            #     == "W8A8_DYNAMIC"
-            # )
 
             if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
                 return UnquantizedLinearMethod()
@@ -210,14 +186,11 @@ def get_scheme(
         get_scheme method adjusted for modelslim, taken from
         python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
         """
-        scheme = self._get_scheme_from_parts(  # type: ignore
-            # weight_quant=weight_quant,
-            # input_quant=input_quant,
+        scheme = self._get_scheme_from_parts(
             layer_name=layer_name,
         )
 
         # Ascend doesn't support device capability
-        # self._check_scheme_supported(scheme.get_min_capability())
         logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
         return scheme
 

From 2bee5c7f3ddb396bf5b92fd908843babb7e882f2 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:37:07 +0300
Subject: [PATCH 041/175] Update model_config.py

---
 python/sglang/srt/configs/model_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 5e5eba62a295..ef7bc4bdcc3a 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -727,8 +727,8 @@ def _verify_quantization(self) -> None:
         # Parse quantization method from the HF and ModelSlim model config, if available.
         # Only one function should return config, other should return None.
         cfg_list = []
-        cfg_list.append(self._parse_quant_hf_config)
-        cfg_list.append(self._find_quant_modelslim_config)
+        cfg_list.append(self._parse_quant_hf_config())
+        cfg_list.append(self._find_quant_modelslim_config())
 
         # Filter out None values
         cfg_list = [item for item in cfg_list if item is not None]

From 2670aa96c74fc3e144ff577790350901a30d92d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 17 Dec 2025 17:01:48 +0300
Subject: [PATCH 042/175] Quickfix

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py  | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 71ab140c1f02..9b4ad95dbd12 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -344,8 +344,7 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         layer.w13_weight.data = cls.pack_to_int32(layer.w13_weight.data)
         layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data)
 
-    staticmethod
-
+    @staticmethod
     def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",

From 1e45ead569332edd54bb6611786d0bf6695061d6 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Wed, 17 Dec 2025 17:38:18 +0300
Subject: [PATCH 043/175] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 9b4ad95dbd12..ce17188d71c8 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -431,7 +431,6 @@ def apply(
 
     @staticmethod
     def apply_without_routing_weights(
-        cls,
         layer,
         hidden_states,
         hidden_states_scale,

From afc11a67f5d0bd9159b4a21d8ba2441313f88d06 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Wed, 17 Dec 2025 19:59:29 +0300
Subject: [PATCH 044/175] Update CODEOWNERS

---
 .github/CODEOWNERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 898a44404431..d86c5a9519d7 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -15,6 +15,7 @@
 /python/sglang/srt/function_call @CatherineSue @JustinTong0323
 /python/sglang/srt/grpc @CatherineSue @slin1237
 /python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname
+/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname 
 /python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1
 /python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064
 /python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @yizhang2077 @hebiao064 @hanming-lu

From 168b2a84aa86bc3d8a6125ffad9dd420fe718c80 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Wed, 17 Dec 2025 20:31:51 +0300
Subject: [PATCH 045/175] Pre-commit fixes

---
 .github/CODEOWNERS                                              | 2 +-
 .../sglang/srt/layers/quantization/msmodelslim/msmodelslim.py   | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d86c5a9519d7..e88cfe589ab6 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -15,7 +15,7 @@
 /python/sglang/srt/function_call @CatherineSue @JustinTong0323
 /python/sglang/srt/grpc @CatherineSue @slin1237
 /python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname
-/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname 
+/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname
 /python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1
 /python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064
 /python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @yizhang2077 @hebiao064 @hanming-lu
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 78e5b2d66ce8..1ba64f7c4601 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -22,7 +22,6 @@
     ModelSlimW4A4Int4,
     ModelSlimW8A8Int8,
 )
-
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.utils import apply_module_patch
 
@@ -178,7 +177,6 @@ def _get_scheme_from_parts(
                 quant_config=self.quant_description, prefix=layer_name
             )
 
-
     def get_scheme(
         self, layer: torch.nn.Module, layer_name: Optional[str] = None
     ) -> Optional[ModelSlimScheme]:

From d5516526e4977e02c3e9261af6750cc7cabfa26d Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 12:31:03 +0300
Subject: [PATCH 046/175] Update msmodelslim_w8a8_int8.py

---
 .../quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index de99c9fed0b7..c462b2a66bea 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -25,7 +25,7 @@ def __init__(
         prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W8A8_DYNAMIC"
+        self.is_dynamic = self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC"
 
     def create_weights(
         self,

From 1cf18c0f7b3ab597bc24621aed607c7945b961dd Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 12:32:25 +0300
Subject: [PATCH 047/175] Update msmodelslim.py

---
 .../srt/layers/quantization/msmodelslim/msmodelslim.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 1ba64f7c4601..5eb341415d1e 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -167,7 +167,7 @@ def _get_scheme_from_parts(
         layer_name: str,
     ) -> ModelSlimScheme:
 
-        quant_type = self.quant_description[layer_name + ".weight"]
+        quant_type = self.quant_description.get(layer_name + ".weight", "")
         if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8":
             return ModelSlimW8A8Int8(
                 quant_config=self.quant_description, prefix=layer_name
@@ -206,7 +206,7 @@ def is_layer_skipped(
             is_skipped = None
             for shard_prefix in shard_prefixes:
                 is_shard_skipped = (
-                    self.quant_description[shard_prefix + ".weight"] == "FLOAT"
+                    self.quant_description.get(shard_prefix + ".weight", "") == "FLOAT"
                 )
 
                 if is_skipped is None:
@@ -218,7 +218,7 @@ def is_layer_skipped(
                         "to have the same precision."
                     )
         else:
-            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
+            is_skipped = self.quant_description.get(prefix + ".weight", "") == "FLOAT"
 
         assert is_skipped is not None
         return is_skipped

From 3dccf89b5a5138826bc6ffb60bd58dab4889b22a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 14:08:02 +0300
Subject: [PATCH 048/175] Delete
 python/sglang/srt/hardware_backend/npu/quantization/modelslim.py

---
 python/sglang/srt/hardware_backend/npu/quantization/modelslim.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 python/sglang/srt/hardware_backend/npu/quantization/modelslim.py

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py b/python/sglang/srt/hardware_backend/npu/quantization/modelslim.py
deleted file mode 100644
index e69de29bb2d1..000000000000

From 1842d0a521e4c15e38a7b237496c01e0b978d2a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 14:15:02 +0300
Subject: [PATCH 049/175] Removed unused code

---
 .../sglang/srt/layers/quantization/msmodelslim/msmodelslim.py   | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index 5eb341415d1e..dc43e6b79b5d 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -75,8 +75,6 @@ class ModelSlimConfig(QuantizationConfig):
     def __init__(self, quant_config: Dict[str, Any] = {}):
         super().__init__()
         self.quant_description = quant_config
-        # self.is_dynamic = quant_config.get("is_dynamic", False)
-        # self.is_moe_w4_dynamic = False
         ignore = cast(List[str], quant_config.get("ignore", []))
         self.ignore = ignore if ignore is not None else []
         packed_modules_mapping = quant_config.get("packed_modules_mapping", {})

From 75de787a1087c42b9a7001fb80064e8d8cf78d94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 16:28:54 +0300
Subject: [PATCH 050/175] Remove --quantization modelslim flag from doc

---
 docs/platforms/ascend_npu_deepseek_example.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/docs/platforms/ascend_npu_deepseek_example.md b/docs/platforms/ascend_npu_deepseek_example.md
index acb864ef568e..08bc98613c23 100644
--- a/docs/platforms/ascend_npu_deepseek_example.md
+++ b/docs/platforms/ascend_npu_deepseek_example.md
@@ -30,7 +30,6 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --attention-backend ascend \
     --device npu \
-    --quantization modelslim \
     --watchdog-timeout 9000 \
     --host 127.0.0.1 \
     --port 6688 \
@@ -89,7 +88,6 @@ python -m sglang.launch_server \
     --mem-fraction-static 0.6 \
     --attention-backend ascend \
     --device npu \
-    --quantization modelslim \
     --disaggregation-transfer-backend ascend \
     --max-running-requests 8 \
     --context-length 8192 \
@@ -145,7 +143,6 @@ python -m sglang.launch_server \
     --max-running-requests 352 \
     --attention-backend ascend \
     --device npu \
-    --quantization modelslim \
     --moe-a2a-backend deepep \
     --enable-dp-attention \
     --deepep-mode low_latency \
@@ -214,7 +211,6 @@ do
       --mem-fraction-static 0.81 \
       --attention-backend ascend \
       --device npu \
-      --quantization modelslim \
       --disaggregation-transfer-backend ascend \
       --max-running-requests 8 \
       --context-length 8192 \
@@ -275,7 +271,6 @@ do
       --max-running-requests 832 \
       --attention-backend ascend \
       --device npu \
-      --quantization modelslim \
       --moe-a2a-backend deepep \
       --enable-dp-attention \
       --deepep-mode low_latency \

From e9587675cd85f2989a5d6eca20702c693a2d7273 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 16:30:42 +0300
Subject: [PATCH 051/175] Delete --quantization "modelslim" flag

---
 test/srt/ascend/test_ascend_deepep.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_deepep.py b/test/srt/ascend/test_ascend_deepep.py
index c5f2cb6faa4a..8822fc2f8278 100644
--- a/test/srt/ascend/test_ascend_deepep.py
+++ b/test/srt/ascend/test_ascend_deepep.py
@@ -34,8 +34,6 @@ def setUpClass(cls):
             "--trust-remote-code",
             "--attention-backend",
             "ascend",
-            "--quantization",
-            "modelslim",
             "--mem-fraction-static",
             0.8,
             "--disable-radix-cache",

From 15678852e1ec86f1892169412bc4983678dd4692 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 16:31:05 +0300
Subject: [PATCH 052/175] Delete --quantization "modelslim" flag

---
 test/srt/ascend/test_ascend_deepseek_mtp.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_deepseek_mtp.py b/test/srt/ascend/test_ascend_deepseek_mtp.py
index 43089f885e97..d08329481d4f 100644
--- a/test/srt/ascend/test_ascend_deepseek_mtp.py
+++ b/test/srt/ascend/test_ascend_deepseek_mtp.py
@@ -32,8 +32,6 @@ def setUpClass(cls):
             "--trust-remote-code",
             "--attention-backend",
             "ascend",
-            "--quantization",
-            "modelslim",
             "--mem-fraction-static",
             0.8,
             "--disable-radix-cache",

From d34cb6fba083f62ac9891a0b7b0b7941b67cee8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 16:31:23 +0300
Subject: [PATCH 053/175] Update test_ascend_hicache_mla.py

---
 test/srt/ascend/test_ascend_hicache_mla.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_hicache_mla.py b/test/srt/ascend/test_ascend_hicache_mla.py
index 5e7c711e868d..d0bc1f378cfa 100644
--- a/test/srt/ascend/test_ascend_hicache_mla.py
+++ b/test/srt/ascend/test_ascend_hicache_mla.py
@@ -35,8 +35,6 @@ def setUpClass(cls):
             0.8,
             "--attention-backend",
             "ascend",
-            "--quantization",
-            "modelslim",
             "--tp-size",
             4,
             "--enable-hierarchical-cache",

From 09a6d445795229410324feb652f8b965554fc261 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 16:32:08 +0300
Subject: [PATCH 054/175] Delete --quantization "modelslim" flag

---
 test/srt/ascend/test_ascend_mla_fia_w8a8int8.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py b/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py
index 1a0eb7f6dd05..bdab4ea05781 100644
--- a/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py
+++ b/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py
@@ -37,8 +37,6 @@ def setUpClass(cls):
             0.8,
             "--attention-backend",
             "ascend",
-            "--quantization",
-            "modelslim",
             "--tp-size",
             2,
             "--disable-radix-cache",

From 2b7003e7f77b1507a48d6bc9fa5499840b70a32d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 16:32:32 +0300
Subject: [PATCH 055/175] Update test_ascend_mla_w8a8int8.py

---
 test/srt/ascend/test_ascend_mla_w8a8int8.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_mla_w8a8int8.py b/test/srt/ascend/test_ascend_mla_w8a8int8.py
index eddae3086c6d..3c3e733669ea 100644
--- a/test/srt/ascend/test_ascend_mla_w8a8int8.py
+++ b/test/srt/ascend/test_ascend_mla_w8a8int8.py
@@ -36,8 +36,6 @@ def setUpClass(cls):
             0.8,
             "--attention-backend",
             "ascend",
-            "--quantization",
-            "modelslim",
             "--tp-size",
             4,
             "--disable-radix-cache",

From 43b5d66d15fae0678e5021b8fbdfd2718aca9d98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:07:37 +0300
Subject: [PATCH 056/175] Create README.md for msModelSlim

---
 .../layers/quantization/msmodelslim/README.md | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 python/sglang/srt/layers/quantization/msmodelslim/README.md

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
new file mode 100644
index 000000000000..9eaa9ab248b0
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -0,0 +1,57 @@
+Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module.
+
+MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
+- [x] W4A4 dynamic linear
+- [x] W8A8 static linear
+- [x] W8A8 dynamic linear 
+- [x] W4A8 dynamic MOE
+- [x] W8A8 dynamic MOE
+
+Also MsModelSlim module include:
+- [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag)
+- [x] Unit-tests for w4a4 modelslim, w8a8 modelslim
+
+Examples of launch:
+server: 
+`SGLANG_SET_CPU_AFFINITY=1
+PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+STREAMS_PER_DEVICE=32
+HCCL_BUFFSIZE=1536
+ENABLE_ASCEND_MOE_NZ=1
+ASCEND_RT_VISIBLE_DEVICES=0,1 
+python3 -m sglang.launch_server --device npu --attention-backend ascend --trust-remote-code --tp-size 2 --model-path *model* --port 30088 --mem-fraction-static 0.8 --cuda-graph-max-bs 16`
+
+client: 
+`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16`
+
+<!-- If this pull request affects model outputs (e.g., changes to the kernel or model forward code), provide accuracy test results. -->
+Qwen3-32B-W4A4 from msmodelslim (dynamic)  - Ascend 910B2
+<img width="844" height="79" alt="image" src="https://github.com/user-attachments/assets/58ca29dd-f885-4877-9657-88e6a7541017" />
+
+Qwen3-32B-W8A8 from msmodelslim (static) - Ascend 910B4
+<img width="835" height="78" alt="image" src="https://github.com/user-attachments/assets/9e0ca923-f76e-45e2-bea1-9699af6a0c43" />
+
+Qwen3-32B-W8A8 from msmodelslim (dynamic) - Ascend 910B2
+<img width="836" height="76" alt="image" src="https://github.com/user-attachments/assets/25e5b740-1e4a-449a-9d6b-51cf72e60140" />
+
+Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) - Ascend 910B2
+<img width="847" height="78" alt="image" src="https://github.com/user-attachments/assets/74fc4fde-9e67-4028-8e44-6dcc6faf9ebc" />
+
+server:
+`sysctl -w vm.swappiness=0
+sysctl -w kernel.numa_balancing=0
+sysctl -w kernel.sched_migration_cost_ns=50000
+export SGLANG_SET_CPU_AFFINITY=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export STREAMS_PER_DEVICE=32
+export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32
+export HCCL_BUFFSIZE=1536
+export ENABLE_ASCEND_MOE_NZ=1
+export HCCL_OP_EXPANSION_MODE=AIV
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3`
+`python3 -m sglang.launch_server --model-path *model* --tp 4 --trust-remote-code --attention-backend ascend --device npu  --host 127.0.0.1 --port 30088 --mem-fraction-static 0.8 --quantization modelslim --moe-a2a-backend deepep --deepep-mode auto`
+client: 
+`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16`
+
+Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) with EP - Ascend 910C 
+<img width="947" height="80" alt="image" src="https://github.com/user-attachments/assets/7808366f-33d7-4cca-a9fc-e0ce4167f682" />

From 420d6e8ab27cc41ac30db92b481ebe1725b0f9a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:08:11 +0300
Subject: [PATCH 057/175] Update README.md

---
 python/sglang/srt/layers/quantization/msmodelslim/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
index 9eaa9ab248b0..3519a00c64ab 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -12,6 +12,7 @@ Also MsModelSlim module include:
 - [x] Unit-tests for w4a4 modelslim, w8a8 modelslim
 
 Examples of launch:
+
 server: 
 `SGLANG_SET_CPU_AFFINITY=1
 PYTORCH_NPU_ALLOC_CONF=expandable_segments:True

From f79f9eed1508d8c9fbcd5e12bd493fcd019afb37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:08:28 +0300
Subject: [PATCH 058/175] Update README.md

---
 python/sglang/srt/layers/quantization/msmodelslim/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
index 3519a00c64ab..3eab5da35f55 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -51,6 +51,7 @@ export ENABLE_ASCEND_MOE_NZ=1
 export HCCL_OP_EXPANSION_MODE=AIV
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3`
 `python3 -m sglang.launch_server --model-path *model* --tp 4 --trust-remote-code --attention-backend ascend --device npu  --host 127.0.0.1 --port 30088 --mem-fraction-static 0.8 --quantization modelslim --moe-a2a-backend deepep --deepep-mode auto`
+
 client: 
 `python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16`
 

From a7c43bb1b7244946b39be6fef1ad1e28ffaafd9d Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:16:27 +0300
Subject: [PATCH 059/175] Update fused_moe_method_npu.py

1/4 W4A16 refactoring
---
 .../npu/quantization/fused_moe_method_npu.py  | 135 ++----------------
 1 file changed, 12 insertions(+), 123 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index ce17188d71c8..9686ce2b4f5a 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -475,116 +475,8 @@ def apply_without_routing_weights(
 
 class NPUW4A16Int4DynamicMoEMethod(FusedMoEMethodBase):
 
-    def __init__(self, quantization_config) -> None:
-        self.pack_factor = 8  # weight dtype is int4,  but use int32 to create
-        target = (
-            "MoEGMM" if "MoEGMM" in quantization_config.target_scheme_map else "Linear"
-        )
-        if target in quantization_config.target_scheme_map:
-            self.group_size = quantization_config.target_scheme_map[target][
-                "weights"
-            ].group_size
-        else:
-            self.group_size = 128
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ) -> None:
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
-
-        self.num_experts = num_experts
-        if (
-            extra_weight_attrs.get(
-                "intermediate_size_full", intermediate_size_per_partition
-            )
-            // intermediate_size_per_partition
-            > 1
-        ):
-            quant_method = FusedMoeWeightScaleSupported.GROUP.value
-        else:
-            quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
-        extra_weight_attrs.update({"quant_method": quant_method})
-        # weight
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size // self.pack_factor,
-                dtype=torch.int32,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition // self.pack_factor,
-                dtype=torch.int32,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # scale
-        weight_scale_dtype = torch.bfloat16
-        w13_weight_scale = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size // self.group_size,
-                dtype=weight_scale_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
-        w2_weight_scale = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition // self.group_size,
-                dtype=weight_scale_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-
-        # offset
-        w13_weight_offset = torch.nn.Parameter(
-            torch.zeros(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size // self.group_size,
-                dtype=weight_scale_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_offset", w13_weight_offset)
-        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
-
-        w2_weight_offset = torch.nn.Parameter(
-            torch.zeros(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition // self.group_size,
-                dtype=weight_scale_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_offset", w2_weight_offset)
-        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
-
-    def pack_to_int32(self, weight: torch.Tensor):
+    @classmethod
+    def pack_to_int32(cls, weight: torch.Tensor):
         assert weight.dim() == 3
         if weight.dtype == torch.int32:
             # pack 8 int4 to int32, we use a int32 to represent a int4
@@ -605,8 +497,9 @@ def pack_to_int32(self, weight: torch.Tensor):
             raise ValueError(f"{weight.dtype=} is not supported !")
         return new_weight
 
+    @classmethod
     def unpack_from_int32(
-        self,
+        cls,
         value: torch.Tensor,
         num_bits: int,
         shape: torch.Size = None,
@@ -669,7 +562,8 @@ def unpack_from_int32(
 
         return unpacked
 
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+    @classmethod
+    def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous()
         w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous()
         layer.w13_weight_scale = torch.nn.Parameter(
@@ -690,33 +584,28 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous()
         # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous()
         unpacked_w13_weight = (
-            self.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4)
+            cls.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4)
             .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1)
             .transpose(1, 2)
             .contiguous()
             .int()
         )
         unpacked_w2_weight = (
-            self.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4)
+            cls.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4)
             .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1)
             .transpose(1, 2)
             .contiguous()
             .int()
         )
 
-        w13_weight = self.pack_to_int32(unpacked_w13_weight)
-        w2_weight = self.pack_to_int32(unpacked_w2_weight)
+        w13_weight = cls.pack_to_int32(unpacked_w13_weight)
+        w2_weight = cls.pack_to_int32(unpacked_w2_weight)
 
         layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
         layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
 
-    def create_moe_runner(
-        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
-    ):
-        self.moe_runner_config = moe_runner_config
-
+    @staticmethod
     def apply(
-        self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
@@ -743,8 +632,8 @@ def apply(
         )
         return StandardCombineInput(hidden_states=output)
 
+    @staticmethod
     def apply_without_routing_weights(
-        self,
         layer,
         hidden_states,
         hidden_states_scale,

From ef2fdb839f8e3a584dcb89933ec7aaa635d1b792 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:17:05 +0300
Subject: [PATCH 060/175] Update README.md

---
 python/sglang/srt/layers/quantization/msmodelslim/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
index 3eab5da35f55..27f0680f0a6b 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -1,5 +1,7 @@
 Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module.
 
+`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be parsed from the downloaded `quant_model_description.json` config.
+
 MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
 - [x] W4A4 dynamic linear
 - [x] W8A8 static linear

From cb95c0a3d3344bcee2a9a9e0700b1a799660eaa9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:17:41 +0300
Subject: [PATCH 061/175] Update README.md

---
 python/sglang/srt/layers/quantization/msmodelslim/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
index 27f0680f0a6b..8db5f10d61cf 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -1,6 +1,6 @@
 Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module.
 
-`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be parsed from the downloaded `quant_model_description.json` config.
+`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
 
 MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
 - [x] W4A4 dynamic linear

From ca38c591205131689c743a60b775724f65347008 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:17:55 +0300
Subject: [PATCH 062/175] Update layer.py

2/4 W4A16 refactoring
---
 python/sglang/srt/layers/moe/ep_moe/layer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index 4f22ba798c84..7fbdcff31e85 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -6,8 +6,8 @@
 import torch
 
 from sglang.srt.environ import envs
-from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
-    NPUW4A16Int4DynamicMoEMethod,
+from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import(
+    NPUCompressedTensorsW4A16Int4DynamicMoEMethod,
 )
 from sglang.srt.layers import deep_gemm_wrapper
 from sglang.srt.layers.moe import (
@@ -351,7 +351,7 @@ def forward_npu(
             else:
                 input_quant = get_bool_env_var("DEEP_NORMAL_MODE_USE_INT8_QUANT")
                 if not input_quant and not isinstance(
-                    self.quant_method, NPUW4A16Int4DynamicMoEMethod
+                    self.quant_method, NPUCompressedTensorsW4A16Int4DynamicMoEMethod
                 ):
                     hidden_states, hidden_states_scale = torch_npu.npu_dynamic_quant(
                         hidden_states

From 583cb4d3188c2aabc975da98208fe15a1b5aa0b1 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:18:46 +0300
Subject: [PATCH 063/175] Update compressed_tensors.py

3/4 W4A16 refactoring
---
 .../compressed_tensors/compressed_tensors.py  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 171573f1b914..56b5f4beb52b 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -449,6 +449,29 @@ def _is_wNa16_group_channel(
 
         return is_channel_group and input_quant_none and is_symmetric and is_static
 
+    def _is_dynamic_token_w4(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+        ) -> bool:
+        is_w4 = weight_quant.num_bits == 4
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value
+        )
+        if input_quant is not None:
+            is_token = (
+                weight_strategy
+                and input_quant.strategy == QuantizationStrategy.TOKEN.value
+            )
+            is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+        else:
+            is_token = weight_strategy
+            is_dynamic = not weight_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_w4 and weight_quant.symmetric and is_token and is_dynamic
+
     def _get_scheme_from_parts(
         self, weight_quant: BaseModel, input_quant: BaseModel
     ) -> CompressedTensorsScheme:

From 8af003309d3ee19ef6e103598d14ee11918d7409 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:20:00 +0300
Subject: [PATCH 064/175] Update compressed_tensors_moe.py

4/4 W4A16 refactoring
---
 .../compressed_tensors_moe.py                 | 167 +++++++++++++++++-
 1 file changed, 160 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 32ed5554736c..daedb41c3c37 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -13,6 +13,7 @@
 
 from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
     NPUW8A8Int8DynamicMoEMethod,
+    NPUW4A16Int4DynamicMoEMethod,
 )
 from sglang.srt.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from sglang.srt.distributed.device_communicators.pynccl_allocator import (
@@ -85,7 +86,9 @@ class GPTQMarlinState(Enum):
     "CompressedTensorsMoEMethod",
     "CompressedTensorsW4A4Nvfp4MoEMethod",
     "CompressedTensorsW8A8Fp8MoEMethod",
-    "NPUCompressedTensorsW8A8Int8MoEMethod" "CompressedTensorsWNA16MoEMethod",
+    "NPUCompressedTensorsW8A8Int8MoEMethod",
+    "CompressedTensorsWNA16MoEMethod",
+    "NPUCompressedTensorsW4A16Int4DynamicMoEMethod"
 ]
 
 
@@ -108,8 +111,13 @@ def get_moe_method(
         input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
 
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
-            logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
-            return CompressedTensorsWNA16MoEMethod(quant_config)
+            if _is_cuda or _is_hip:
+                logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
+                return CompressedTensorsWNA16MoEMethod(quant_config)
+            elif _is_npu:
+                if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None:
+                    logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod")
+                    return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config)
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod")
             return CompressedTensorsW4A4Nvfp4MoEMethod(quant_config)
@@ -118,8 +126,8 @@ def get_moe_method(
             return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
         elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
             if _is_npu:
-                logger.info_once("Using NPUCompressedTensorsW8A8Int8MoEMethod")
-                return NPUCompressedTensorsW8A8Int8MoEMethod(quant_config)
+                logger.info_once("Using NPUCompressedTensorsW8A8Int8DynamicMoEMethod")
+                return NPUCompressedTensorsW8A8Int8DynamicMoEMethod(quant_config)
             else:
                 raise NotImplementedError(
                     f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
@@ -866,7 +874,7 @@ def apply(
             return self.runner.run(dispatch_output, quant_info)
 
 
-class NPUCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
+class NPUCompressedTensorsW8A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
 
     def __init__(self, quant_config: CompressedTensorsConfig):
         self.quant_config = quant_config
@@ -969,7 +977,6 @@ def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
     ):
         self.moe_runner_config = moe_runner_config
-        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
 
     def apply(
         self,
@@ -1286,3 +1293,149 @@ def apply(
             routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
         )
         return StandardCombineInput(hidden_states=output)
+
+
+class NPUCompressedTensorsW4A16Int4DynamicMoEMethod(CompressedTensorsMoEMethod):
+
+    def __init__(self, quantization_config) -> None:
+        self.pack_factor = 8  # weight dtype is int4,  but use int32 to create
+        target = (
+            "MoEGMM" if "MoEGMM" in quantization_config.target_scheme_map else "Linear"
+        )
+        if target in quantization_config.target_scheme_map:
+            self.group_size = quantization_config.target_scheme_map[target][
+                "weights"
+            ].group_size
+        else:
+            self.group_size = 128
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.num_experts = num_experts
+        if (
+            extra_weight_attrs.get(
+                "intermediate_size_full", intermediate_size_per_partition
+            )
+            // intermediate_size_per_partition
+            > 1
+        ):
+            quant_method = FusedMoeWeightScaleSupported.GROUP.value
+        else:
+            quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
+        extra_weight_attrs.update({"quant_method": quant_method})
+        # weight
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # scale
+        weight_scale_dtype = torch.bfloat16
+        w13_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        w2_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # offset
+        w13_weight_offset = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_offset", w13_weight_offset)
+        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+
+        w2_weight_offset = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_offset", w2_weight_offset)
+        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        NPUW4A16Int4DynamicMoEMethod.process_weights_after_loading(layer)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        return NPUW4A16Int4DynamicMoEMethod.apply(layer, dispatch_output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return NPUW4A16Int4DynamicMoEMethod.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )

From 9f8c40708ecac06a833f6f0d11e1a161e24769c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 17:24:23 +0300
Subject: [PATCH 065/175] Quickfix

---
 .../compressed_tensors/compressed_tensors.py  | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 56b5f4beb52b..d69a222fb836 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -533,33 +533,35 @@ def _get_scheme_from_parts(
                 )
 
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                if _is_npu:
-                    return NPUCompressedTensorsW8A8Int8(
-                        strategy=weight_quant.strategy,
-                        is_static_input_scheme=True,
-                        input_symmetric=input_quant.symmetric,
-                    )
-                else:
+                if _is_cuda:
                     return GPUCompressedTensorsW8A8Int8(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=True,
                         input_symmetric=input_quant.symmetric,
-                    )
+                    )  
+                elif _is_npu:
+                    return NPUCompressedTensorsW8A8Int8(
+                            strategy=weight_quant.strategy,
+                            is_static_input_scheme=True,
+                            input_symmetric=input_quant.symmetric,
+                        )
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                if _is_npu:
-                    return NPUCompressedTensorsW8A8Int8(
+                if _is_cuda:
+                    return GPUCompressedTensorsW8A8Int8(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=False,
                         input_symmetric=input_quant.symmetric,
                     )
-                else:
-                    return GPUCompressedTensorsW8A8Int8(
+                elif _is_npu:
+                    return NPUCompressedTensorsW8A8Int8(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=False,
                         input_symmetric=input_quant.symmetric,
                     )
 
+                    
+
         raise NotImplementedError("No compressed-tensors compatible scheme was found.")
 
     def get_scheme(

From 72efd3ac02f0f5c753966a0dd62d6b9481e02cc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 18:10:23 +0300
Subject: [PATCH 066/175] Update README.md

---
 .../layers/quantization/msmodelslim/README.md | 47 -------------------
 1 file changed, 47 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
index 8db5f10d61cf..c4a5a8b1f14f 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -12,50 +12,3 @@ MsModelSlim was developed in the format of compressed_tensors and includes suppo
 Also MsModelSlim module include:
 - [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag)
 - [x] Unit-tests for w4a4 modelslim, w8a8 modelslim
-
-Examples of launch:
-
-server: 
-`SGLANG_SET_CPU_AFFINITY=1
-PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-STREAMS_PER_DEVICE=32
-HCCL_BUFFSIZE=1536
-ENABLE_ASCEND_MOE_NZ=1
-ASCEND_RT_VISIBLE_DEVICES=0,1 
-python3 -m sglang.launch_server --device npu --attention-backend ascend --trust-remote-code --tp-size 2 --model-path *model* --port 30088 --mem-fraction-static 0.8 --cuda-graph-max-bs 16`
-
-client: 
-`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16`
-
-<!-- If this pull request affects model outputs (e.g., changes to the kernel or model forward code), provide accuracy test results. -->
-Qwen3-32B-W4A4 from msmodelslim (dynamic)  - Ascend 910B2
-<img width="844" height="79" alt="image" src="https://github.com/user-attachments/assets/58ca29dd-f885-4877-9657-88e6a7541017" />
-
-Qwen3-32B-W8A8 from msmodelslim (static) - Ascend 910B4
-<img width="835" height="78" alt="image" src="https://github.com/user-attachments/assets/9e0ca923-f76e-45e2-bea1-9699af6a0c43" />
-
-Qwen3-32B-W8A8 from msmodelslim (dynamic) - Ascend 910B2
-<img width="836" height="76" alt="image" src="https://github.com/user-attachments/assets/25e5b740-1e4a-449a-9d6b-51cf72e60140" />
-
-Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) - Ascend 910B2
-<img width="847" height="78" alt="image" src="https://github.com/user-attachments/assets/74fc4fde-9e67-4028-8e44-6dcc6faf9ebc" />
-
-server:
-`sysctl -w vm.swappiness=0
-sysctl -w kernel.numa_balancing=0
-sysctl -w kernel.sched_migration_cost_ns=50000
-export SGLANG_SET_CPU_AFFINITY=1
-export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export STREAMS_PER_DEVICE=32
-export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32
-export HCCL_BUFFSIZE=1536
-export ENABLE_ASCEND_MOE_NZ=1
-export HCCL_OP_EXPANSION_MODE=AIV
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3`
-`python3 -m sglang.launch_server --model-path *model* --tp 4 --trust-remote-code --attention-backend ascend --device npu  --host 127.0.0.1 --port 30088 --mem-fraction-static 0.8 --quantization modelslim --moe-a2a-backend deepep --deepep-mode auto`
-
-client: 
-`python ./benchmark/gsm8k/bench_sglang.py --num-questions 1319 --port 30088 --data-path ../gsm8k/test.jsonl --parallel 16`
-
-Qwen3-30B-W8A8 from msmodelslim (attn - static / mlp - dynamic) with EP - Ascend 910C 
-<img width="947" height="80" alt="image" src="https://github.com/user-attachments/assets/7808366f-33d7-4cca-a9fc-e0ce4167f682" />

From 384835b500bbbd3e7557ba9a0ccbbadd3aa6d0cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 18:42:33 +0300
Subject: [PATCH 067/175] Update msmodelslim_moe.py

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 4b7b596c4f8d..e580fc3df306 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -69,7 +69,6 @@ def __init__(
         self.quant_config = quant_config
         self.group_size = 0
         self.tp_size = 1
-        self.is_per_channel_weight = self.group_size == 0
 
     def create_weights(
         self,
@@ -82,6 +81,7 @@ def create_weights(
     ) -> None:
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
+        self.is_per_channel_weight = self.group_size == 0
         self.num_experts = num_experts
         extra_weight_attrs.update(
             {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
@@ -141,9 +141,9 @@ def create_weights(
         )
         layer.register_parameter("w2_weight_offset", w2_weight_offset)
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
-
+        
+        # >>> special param for w4a8
         if not self.is_per_channel_weight:
-            # >>> special param for w4a8
             w13_weight_scale_second = torch.nn.Parameter(
                 torch.empty(
                     num_experts,

From 4ebfb54f1eba1b01ec1743cf6e333bc23452fd78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 18:43:44 +0300
Subject: [PATCH 068/175] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py  | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 9686ce2b4f5a..8598ff650aa8 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -258,8 +258,7 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
     @classmethod
     def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
-        # if cls.is_per_channel_weight:
-        if True:
+        if cls.is_per_channel_weight:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
             scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()

From 0cfbd93666437fbfb89ded6b2b914322d1b22af6 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 19:06:03 +0300
Subject: [PATCH 069/175] Create test_ascend_w4a4_quantization.py in srt/ascend

1/4 new CI tests
---
 .../ascend/test_ascend_w4a4_quantization.py   | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 test/srt/ascend/test_ascend_w4a4_quantization.py

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
new file mode 100644
index 000000000000..c2251ec94a9d
--- /dev/null
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -0,0 +1,108 @@
+"""
+Usage:
+python3 -m unittest test_ascend_w4a4_quantization.TestAscendW4A4.test_gsm8k
+"""
+
+import os
+import time
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestAscendW4A4(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+                "--tp-size",
+                "2",
+                "--mem-fraction-static",
+                "0.8",
+                "--cuda-graph-bs",
+                "64",
+                "--disable-radix-cache",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        base_url = DEFAULT_URL_FOR_TEST
+        url = urlparse(base_url)
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=128,
+            max_new_tokens=512,
+            parallel=64,
+            host=f"http://{url.hostname}",
+            port=int(url.port),
+        )
+        metrics = run_eval(args)
+        print(metrics)
+
+        self.assertGreaterEqual(metrics["accuracy"], 0.75)
+        self.assertGreaterEqual(metrics["output_throughput"], 700)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(res["text"])
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+
+        if is_in_ci():
+            self.assertGreaterEqual(throughput, 25)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 87b65a820c49409f351d9c1c21f147f240cca5a5 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 19:06:31 +0300
Subject: [PATCH 070/175] Delete
 test/manual/ascend/test_ascend_w4a4_quantization.py

2/4 new CI tests
---
 .../ascend/test_ascend_w4a4_quantization.py   | 108 ------------------
 1 file changed, 108 deletions(-)
 delete mode 100644 test/manual/ascend/test_ascend_w4a4_quantization.py

diff --git a/test/manual/ascend/test_ascend_w4a4_quantization.py b/test/manual/ascend/test_ascend_w4a4_quantization.py
deleted file mode 100644
index c2251ec94a9d..000000000000
--- a/test/manual/ascend/test_ascend_w4a4_quantization.py
+++ /dev/null
@@ -1,108 +0,0 @@
-"""
-Usage:
-python3 -m unittest test_ascend_w4a4_quantization.TestAscendW4A4.test_gsm8k
-"""
-
-import os
-import time
-import unittest
-from types import SimpleNamespace
-from urllib.parse import urlparse
-
-import requests
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.few_shot_gsm8k import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    is_in_ci,
-    popen_launch_server,
-)
-
-if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
-DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-    7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
-)
-DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
-
-
-class TestAscendW4A4(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--device",
-                "npu",
-                "--attention-backend",
-                "ascend",
-                "--tp-size",
-                "2",
-                "--mem-fraction-static",
-                "0.8",
-                "--cuda-graph-bs",
-                "64",
-                "--disable-radix-cache",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        base_url = DEFAULT_URL_FOR_TEST
-        url = urlparse(base_url)
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=128,
-            max_new_tokens=512,
-            parallel=64,
-            host=f"http://{url.hostname}",
-            port=int(url.port),
-        )
-        metrics = run_eval(args)
-        print(metrics)
-
-        self.assertGreaterEqual(metrics["accuracy"], 0.75)
-        self.assertGreaterEqual(metrics["output_throughput"], 700)
-
-    def run_decode(self, max_new_tokens):
-        response = requests.post(
-            self.base_url + "/generate",
-            json={
-                "text": "The capital of France is",
-                "sampling_params": {
-                    "temperature": 0,
-                    "max_new_tokens": max_new_tokens,
-                },
-                "ignore_eos": True,
-            },
-        )
-        return response.json()
-
-    def test_throughput(self):
-        max_tokens = 256
-
-        tic = time.perf_counter()
-        res = self.run_decode(max_tokens)
-        tok = time.perf_counter()
-        print(res["text"])
-        throughput = max_tokens / (tok - tic)
-        print(f"Throughput: {throughput} tokens/s")
-
-        if is_in_ci():
-            self.assertGreaterEqual(throughput, 25)
-
-
-if __name__ == "__main__":
-    unittest.main()

From 177102dc660f38f78cf132bc42b38050919c02ad Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 19:07:53 +0300
Subject: [PATCH 071/175] Create test_ascend_w8a8_quantization.py

3/4 new CI tests
---
 .../ascend/test_ascend_w8a8_quantization.py   | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 test/srt/ascend/test_ascend_w8a8_quantization.py

diff --git a/test/srt/ascend/test_ascend_w8a8_quantization.py b/test/srt/ascend/test_ascend_w8a8_quantization.py
new file mode 100644
index 000000000000..f3f9cdff952b
--- /dev/null
+++ b/test/srt/ascend/test_ascend_w8a8_quantization.py
@@ -0,0 +1,103 @@
+"""
+Usage:
+python3 -m unittest test_ascend_w8a8_quantization.TestAscendW8A8.test_gsm8k
+"""
+
+import os
+import time
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestAscendW8A8CompressedTensors(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        # TODO: Move model to CI or Modelscope
+        cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--disable-cuda-graph",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        base_url = DEFAULT_URL_FOR_TEST
+        url = urlparse(base_url)
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{url.hostname}",
+            port=int(url.port),
+        )
+        metrics = run_eval(args)
+        print(metrics)
+
+        self.assertGreaterEqual(metrics["accuracy"], 0.3)
+        self.assertGreaterEqual(metrics["output_throughput"], 1000)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(res["text"])
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+
+        if is_in_ci():
+            self.assertGreaterEqual(throughput, 25)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 16ca7733ae6825d0d5337a7692b089bd478f908e Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 19:08:41 +0300
Subject: [PATCH 072/175] Update run_suite.py

4/4 new CI tests
---
 test/srt/run_suite.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 541d59b10901..89391e42aca1 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -361,12 +361,14 @@
         TestFile("ascend/test_ascend_sampling_backend.py", 400),
         TestFile("ascend/test_ascend_tp1_bf16.py", 400),
         TestFile("ascend/test_ascend_compile_graph_tp1_bf16.py", 400),
+        TestFile("ascend/test_ascend_w8a8_quantization.py", 400),
     ],
     "per-commit-2-npu-a2": [
         TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
     ],
     "per-commit-4-npu-a2": [
         TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),

From c6def39082c0ad45cbeeae89cdacd4c4b37a9cda Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 18 Dec 2025 19:11:10 +0300
Subject: [PATCH 073/175] Update test_ascend_w8a8_quantization.py

Remove compressed-tensors test, remove quantization flag
---
 .../ascend/test_ascend_w8a8_quantization.py   | 72 -------------------
 1 file changed, 72 deletions(-)

diff --git a/test/manual/ascend/test_ascend_w8a8_quantization.py b/test/manual/ascend/test_ascend_w8a8_quantization.py
index 959bf88a513f..e013c150c314 100644
--- a/test/manual/ascend/test_ascend_w8a8_quantization.py
+++ b/test/manual/ascend/test_ascend_w8a8_quantization.py
@@ -45,8 +45,6 @@ def setUpClass(cls):
                 "npu",
                 "--attention-backend",
                 "ascend",
-                "--quantization",
-                "w8a8_int8",
             ],
         )
 
@@ -100,75 +98,5 @@ def test_throughput(self):
             self.assertGreaterEqual(throughput, 25)
 
 
-class TestAscendW8A8CompressedTensors(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        # TODO: Move model to CI or Modelscope
-        cls.model = "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--disable-cuda-graph",
-                "--device",
-                "npu",
-                "--attention-backend",
-                "ascend",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        base_url = DEFAULT_URL_FOR_TEST
-        url = urlparse(base_url)
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host=f"http://{url.hostname}",
-            port=int(url.port),
-        )
-        metrics = run_eval(args)
-        print(metrics)
-
-        self.assertGreaterEqual(metrics["accuracy"], 0.3)
-        self.assertGreaterEqual(metrics["output_throughput"], 1000)
-
-    def run_decode(self, max_new_tokens):
-        response = requests.post(
-            self.base_url + "/generate",
-            json={
-                "text": "The capital of France is",
-                "sampling_params": {
-                    "temperature": 0,
-                    "max_new_tokens": max_new_tokens,
-                },
-                "ignore_eos": True,
-            },
-        )
-        return response.json()
-
-    def test_throughput(self):
-        max_tokens = 256
-
-        tic = time.perf_counter()
-        res = self.run_decode(max_tokens)
-        tok = time.perf_counter()
-        print(res["text"])
-        throughput = max_tokens / (tok - tic)
-        print(f"Throughput: {throughput} tokens/s")
-
-        if is_in_ci():
-            self.assertGreaterEqual(throughput, 25)
-
-
 if __name__ == "__main__":
     unittest.main()

From d0dd42766f450070efae9a12c1c786e971261eee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 20:12:47 +0300
Subject: [PATCH 074/175] Create ascend_npu_quantization.md

---
 docs/platforms/ascend_npu_quantization.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 docs/platforms/ascend_npu_quantization.md

diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md
new file mode 100644
index 000000000000..053e12777b0b
--- /dev/null
+++ b/docs/platforms/ascend_npu_quantization.md
@@ -0,0 +1,19 @@
+Quantization on Ascend.
+
+To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` or `config.json` config.
+
+MsModelSlim on Ascend support:
+- [x] W4A4 dynamic linear
+- [x] W8A8 static linear
+- [x] W8A8 dynamic linear 
+- [x] W4A8 dynamic MOE
+- [x] W8A8 dynamic MOE
+
+AWQ on Ascend support:
+- [x] W4A16 linear
+- [x] W8A16 MOE
+      
+Compressed-tensors (LLM Compressor) on Ascend support:
+- [x] W8A8 dynamic linear 
+- [x] W8A8 dynamic MOE
+- [x] W4A16 MOE

From 2e1219fe9a18ec3efa84289d10ef3495db427aed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 20:16:21 +0300
Subject: [PATCH 075/175] Bugfix

---
 .../quantization/compressed_tensors/compressed_tensors.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index d69a222fb836..107ec12a8b11 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -54,11 +54,11 @@
 )
 from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
-from sglang.srt.utils import is_npu
+from sglang.srt.utils import is_cuda, is_npu
 
+_is_cuda = is_cuda()
 _is_npu = is_npu()
 
-
 if TYPE_CHECKING:
     from sglang.srt.models.utils import WeightsMapper
 

From 9d6ffbd6e68aad1a83d038a0bc972a9ea7e486ef Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Thu, 18 Dec 2025 20:17:20 +0300
Subject: [PATCH 076/175] Pre-commit fixes

---
 docs/platforms/ascend_npu_quantization.md     |  6 +++---
 .../npu/quantization/fused_moe_method_npu.py  |  2 --
 python/sglang/srt/layers/moe/ep_moe/layer.py  |  6 +++---
 .../compressed_tensors/compressed_tensors.py  | 14 ++++++--------
 .../compressed_tensors_moe.py                 | 19 ++++++++++++-------
 .../layers/quantization/msmodelslim/README.md |  2 +-
 .../msmodelslim/msmodelslim_moe.py            |  2 +-
 .../schemes/msmodelslim_w8a8_int8.py          |  4 +++-
 8 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md
index 053e12777b0b..08623f6ffba3 100644
--- a/docs/platforms/ascend_npu_quantization.md
+++ b/docs/platforms/ascend_npu_quantization.md
@@ -5,15 +5,15 @@ To load already quantized models, simply load the model weights and config. Agai
 MsModelSlim on Ascend support:
 - [x] W4A4 dynamic linear
 - [x] W8A8 static linear
-- [x] W8A8 dynamic linear 
+- [x] W8A8 dynamic linear
 - [x] W4A8 dynamic MOE
 - [x] W8A8 dynamic MOE
 
 AWQ on Ascend support:
 - [x] W4A16 linear
 - [x] W8A16 MOE
-      
+
 Compressed-tensors (LLM Compressor) on Ascend support:
-- [x] W8A8 dynamic linear 
+- [x] W8A8 dynamic linear
 - [x] W8A8 dynamic MOE
 - [x] W4A16 MOE
diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 8598ff650aa8..17da9aaeadea 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -5,10 +5,8 @@
 
 from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-from sglang.srt.utils import set_weight_attrs
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe import MoeRunnerConfig
     from sglang.srt.layers.moe.token_dispatcher import (
         CombineInput,
         StandardDispatchOutput,
diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index 7fbdcff31e85..001da6e849a0 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -6,9 +6,6 @@
 import torch
 
 from sglang.srt.environ import envs
-from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import(
-    NPUCompressedTensorsW4A16Int4DynamicMoEMethod,
-)
 from sglang.srt.layers import deep_gemm_wrapper
 from sglang.srt.layers.moe import (
     get_deepep_mode,
@@ -22,6 +19,9 @@
 )
 from sglang.srt.layers.moe.topk import TopKOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+    NPUCompressedTensorsW4A16Int4DynamicMoEMethod,
+)
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 107ec12a8b11..c515018a100c 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -451,7 +451,7 @@ def _is_wNa16_group_channel(
 
     def _is_dynamic_token_w4(
         self, weight_quant: BaseModel, input_quant: BaseModel
-        ) -> bool:
+    ) -> bool:
         is_w4 = weight_quant.num_bits == 4
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
@@ -538,13 +538,13 @@ def _get_scheme_from_parts(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=True,
                         input_symmetric=input_quant.symmetric,
-                    )  
+                    )
                 elif _is_npu:
                     return NPUCompressedTensorsW8A8Int8(
-                            strategy=weight_quant.strategy,
-                            is_static_input_scheme=True,
-                            input_symmetric=input_quant.symmetric,
-                        )
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=True,
+                        input_symmetric=input_quant.symmetric,
+                    )
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                 if _is_cuda:
@@ -560,8 +560,6 @@ def _get_scheme_from_parts(
                         input_symmetric=input_quant.symmetric,
                     )
 
-                    
-
         raise NotImplementedError("No compressed-tensors compatible scheme was found.")
 
     def get_scheme(
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index daedb41c3c37..157adfd5b88e 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -11,14 +11,14 @@
 from compressed_tensors import CompressionFormat
 from compressed_tensors.quantization import QuantizationStrategy
 
-from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
-    NPUW8A8Int8DynamicMoEMethod,
-    NPUW4A16Int4DynamicMoEMethod,
-)
 from sglang.srt.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from sglang.srt.distributed.device_communicators.pynccl_allocator import (
     use_symmetric_memory,
 )
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    NPUW4A16Int4DynamicMoEMethod,
+    NPUW8A8Int8DynamicMoEMethod,
+)
 from sglang.srt.layers.dp_attention import is_allocation_symmetric
 from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
 from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
@@ -88,7 +88,7 @@ class GPTQMarlinState(Enum):
     "CompressedTensorsW8A8Fp8MoEMethod",
     "NPUCompressedTensorsW8A8Int8MoEMethod",
     "CompressedTensorsWNA16MoEMethod",
-    "NPUCompressedTensorsW4A16Int4DynamicMoEMethod"
+    "NPUCompressedTensorsW4A16Int4DynamicMoEMethod",
 ]
 
 
@@ -115,8 +115,13 @@ def get_moe_method(
                 logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                 return CompressedTensorsWNA16MoEMethod(quant_config)
             elif _is_npu:
-                if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None:
-                    logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod")
+                if (
+                    quant_config._is_dynamic_token_w4(weight_quant, input_quant)
+                    and input_quant is None
+                ):
+                    logger.info_once(
+                        "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod"
+                    )
                     return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config)
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod")
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
index c4a5a8b1f14f..d02d8f3b028f 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -5,7 +5,7 @@ Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelsl
 MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
 - [x] W4A4 dynamic linear
 - [x] W8A8 static linear
-- [x] W8A8 dynamic linear 
+- [x] W8A8 dynamic linear
 - [x] W4A8 dynamic MOE
 - [x] W8A8 dynamic MOE
 
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index e580fc3df306..e7d7f6c3c745 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -141,7 +141,7 @@ def create_weights(
         )
         layer.register_parameter("w2_weight_offset", w2_weight_offset)
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
-        
+
         # >>> special param for w4a8
         if not self.is_per_channel_weight:
             w13_weight_scale_second = torch.nn.Parameter(
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index c462b2a66bea..8250c7c4c576 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -25,7 +25,9 @@ def __init__(
         prefix: str,
     ):
         self.quant_config = quant_config
-        self.is_dynamic = self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC"
+        self.is_dynamic = (
+            self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC"
+        )
 
     def create_weights(
         self,

From 17a62487eff94cd4144a14bc7e58dd1cf1fcc0b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 20:19:43 +0300
Subject: [PATCH 077/175] Update fused_moe_method_npu.py

---
 .../npu/quantization/fused_moe_method_npu.py               | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 17da9aaeadea..1e6ee311cd42 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -256,11 +256,14 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
     @classmethod
     def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
-        if cls.is_per_channel_weight:
+
+        ### TODO fix group_size=0 behaivor
+        '''if cls.is_per_channel_weight:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
             scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()
-            return scale_uint64_tensor, None
+            return scale_uint64_tensor, None'''
+        
         per_group_scale = per_group_scale.transpose(1, 2).contiguous()
         group_num, k, n = weight.shape
         # the weight of the new version is reduced by half by pack n, so it needs to be restored

From 0bf3389d8b1a0bd3df0f867c312b273fd6787013 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 20:23:10 +0300
Subject: [PATCH 078/175] Fix missprint

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 1e6ee311cd42..fb39bca777f0 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -257,7 +257,7 @@ class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
     def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
 
-        ### TODO fix group_size=0 behaivor
+        ### TODO fix group_size=0 behavior
         '''if cls.is_per_channel_weight:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32

From 1d2815795d759aa7f72fb8d1a8c9fc477ef9d059 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Thu, 18 Dec 2025 20:28:51 +0300
Subject: [PATCH 079/175] Pre-commit fixes

---
 .../npu/quantization/fused_moe_method_npu.py                | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index fb39bca777f0..0c9a6940ec0a 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -258,12 +258,12 @@ def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
         scale = scale.transpose(1, 2).contiguous()
 
         ### TODO fix group_size=0 behavior
-        '''if cls.is_per_channel_weight:
+        """if cls.is_per_channel_weight:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
             scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()
-            return scale_uint64_tensor, None'''
-        
+            return scale_uint64_tensor, None"""
+
         per_group_scale = per_group_scale.transpose(1, 2).contiguous()
         group_num, k, n = weight.shape
         # the weight of the new version is reduced by half by pack n, so it needs to be restored

From a5b88e9b9861df1dad0b31d417d5d22dbcace79a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 22:42:55 +0300
Subject: [PATCH 080/175] Update ascend_npu_quantization.md

---
 docs/platforms/ascend_npu_quantization.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md
index 08623f6ffba3..8f663a9a60af 100644
--- a/docs/platforms/ascend_npu_quantization.md
+++ b/docs/platforms/ascend_npu_quantization.md
@@ -9,9 +9,11 @@ MsModelSlim on Ascend support:
 - [x] W4A8 dynamic MOE
 - [x] W8A8 dynamic MOE
 
-AWQ on Ascend support:
+AWQ on Ascend support: 
 - [x] W4A16 linear
-- [x] W8A16 MOE
+- [x] W8A16 linear # Test required
+- [x] W4A16 MOE # Test required
+- [x] W8A16 MOE # Test required
 
 Compressed-tensors (LLM Compressor) on Ascend support:
 - [x] W8A8 dynamic linear

From 30f7b10bdc6c04c3d7fc54edaccacaed1ea453f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 18 Dec 2025 22:44:55 +0300
Subject: [PATCH 081/175] Update ascend_npu_quantization.md

---
 docs/platforms/ascend_npu_quantization.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md
index 8f663a9a60af..860ad950f259 100644
--- a/docs/platforms/ascend_npu_quantization.md
+++ b/docs/platforms/ascend_npu_quantization.md
@@ -9,7 +9,7 @@ MsModelSlim on Ascend support:
 - [x] W4A8 dynamic MOE
 - [x] W8A8 dynamic MOE
 
-AWQ on Ascend support: 
+AWQ on Ascend support:
 - [x] W4A16 linear
 - [x] W8A16 linear # Test required
 - [x] W4A16 MOE # Test required

From 22c85ce1d467c72d552bf776104ef1a567622972 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 19 Dec 2025 10:56:36 +0300
Subject: [PATCH 082/175] Update python/sglang/srt/configs/model_config.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 python/sglang/srt/configs/model_config.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index e9f479f4a33b..c989d9fddf92 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -732,11 +732,11 @@ def _verify_quantization(self) -> None:
 
         # Filter out None values
         cfg_list = [item for item in cfg_list if item is not None]
-        assert (
-            len(cfg_list) == 1
-        ), "Config list contains configs from 2 methods, must be only 1"
-
-        quant_cfg = cfg_list[0]
+        if len(cfg_list) > 1:
+            raise ValueError(
+                "Config list contains configs from 2 methods, must be only 1"
+            )
+        quant_cfg = cfg_list[0] if cfg_list else None
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get(

From 21b9219b8b5693a80817386bde9051e024daf67c Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:48:41 +0300
Subject: [PATCH 083/175] Update compressed_tensors.py

Review fix 1/5
---
 .../compressed_tensors/compressed_tensors.py       | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index c515018a100c..8a41fe3fa7b1 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -44,7 +44,7 @@
     CompressedTensorsW8A8Fp8,
     CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16,
-    GPUCompressedTensorsW8A8Int8,
+    CompressedTensorsW8A8Int8,
     NPUCompressedTensorsW8A8Int8,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import (
@@ -533,13 +533,13 @@ def _get_scheme_from_parts(
                 )
 
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                if _is_cuda:
-                    return GPUCompressedTensorsW8A8Int8(
+                if not _is_npu:
+                    return CompressedTensorsW8A8Int8(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=True,
                         input_symmetric=input_quant.symmetric,
                     )
-                elif _is_npu:
+                else:
                     return NPUCompressedTensorsW8A8Int8(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=True,
@@ -547,13 +547,13 @@ def _get_scheme_from_parts(
                     )
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                if _is_cuda:
-                    return GPUCompressedTensorsW8A8Int8(
+                if not _is_npu:
+                    return CompressedTensorsW8A8Int8(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=False,
                         input_symmetric=input_quant.symmetric,
                     )
-                elif _is_npu:
+                else:
                     return NPUCompressedTensorsW8A8Int8(
                         strategy=weight_quant.strategy,
                         is_static_input_scheme=False,

From 52b10881814d670ae2fd3f083244801fabb29266 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:49:10 +0300
Subject: [PATCH 084/175] Update compressed_tensors_moe.py

Review fix 2/5
---
 .../compressed_tensors_moe.py                 | 21 +++++++------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 157adfd5b88e..5cd2ec12792a 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -111,17 +111,12 @@ def get_moe_method(
         input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
 
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
-            if _is_cuda or _is_hip:
+            if not _is_npu:
                 logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                 return CompressedTensorsWNA16MoEMethod(quant_config)
-            elif _is_npu:
-                if (
-                    quant_config._is_dynamic_token_w4(weight_quant, input_quant)
-                    and input_quant is None
-                ):
-                    logger.info_once(
-                        "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod"
-                    )
+            else:
+                if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None:
+                    logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod")
                     return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config)
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod")
@@ -887,10 +882,7 @@ def __init__(self, quant_config: CompressedTensorsConfig):
         self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
             "input_activations"
         )
-        if not _is_npu:
-            raise NotImplementedError(
-                "w8a8 int8 compressed tensors moe scheme is supported only for Ascend device for now."
-            )
+
         self.static_input_scales = not self.input_quant.dynamic
         per_channel = (
             self.weight_quant.strategy == QuantizationStrategy.CHANNEL
@@ -1314,6 +1306,9 @@ def __init__(self, quantization_config) -> None:
         else:
             self.group_size = 128
 
+    # TODO: See if we can merge this method's logic
+    # with CompressedTensorsWNA16MoEMethod. Need more models and tests.
+    # @OrangeRedeng @TamirBaydasov
     def create_weights(
         self,
         layer: torch.nn.Module,

From 2a5f7457ca840c05e9595499fa2c1935fdab4c18 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:49:49 +0300
Subject: [PATCH 085/175] Update __init__.py

Review fix 3/5
---
 .../quantization/compressed_tensors/schemes/__init__.py       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
index baf528fea204..70ca328c8a91 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -4,7 +4,7 @@
 from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4
 from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
 from .compressed_tensors_w8a8_int8 import (
-    GPUCompressedTensorsW8A8Int8,
+    CompressedTensorsW8A8Int8,
     NPUCompressedTensorsW8A8Int8,
 )
 from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
@@ -14,7 +14,7 @@
     "CompressedTensorsScheme",
     "CompressedTensorsW8A8Fp8",
     "CompressedTensorsW8A16Fp8",
-    "GPUCompressedTensorsW8A8Int8",
+    "CompressedTensorsW8A8Int8",
     "NPUCompressedTensorsW8A8Int8",
     "CompressedTensorsWNA16",
     "WNA16_SUPPORTED_BITS",

From 309e5efdb3db2389dfbc44b8c35ac4f1fbd86189 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:50:23 +0300
Subject: [PATCH 086/175] Update compressed_tensors_w8a8_int8.py

Review fix 4/5
---
 .../schemes/compressed_tensors_w8a8_int8.py               | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index d307f6b01c33..6db89e9f1ac2 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -91,14 +91,6 @@ def create_weights(
                 )
                 layer.register_parameter("input_zero_point", input_zero_point)
 
-
-class GPUCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8):
-
-    def __init__(
-        self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
-    ):
-        super().__init__(strategy, is_static_input_scheme, input_symmetric)
-
     @classmethod
     def get_min_capability(cls) -> int:
         # ampere and up

From 611546d32cf8326861c93e1b17beedc9765289db Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:51:27 +0300
Subject: [PATCH 087/175] Update README.md

Review fix 5/5
---
 python/sglang/srt/layers/quantization/msmodelslim/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
index d02d8f3b028f..65f5eb029323 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ b/python/sglang/srt/layers/quantization/msmodelslim/README.md
@@ -1,6 +1,6 @@
 Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module.
 
-`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
+`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with MSModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
 
 MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
 - [x] W4A4 dynamic linear

From d2888fdd86bc9c3bd240bda1ca2b84b83d2b7a90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 19 Dec 2025 14:48:49 +0300
Subject: [PATCH 088/175] Update linear_method_npu.py

---
 .../srt/hardware_backend/npu/quantization/linear_method_npu.py   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 2d70834caf0b..6ab0d35652d1 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -134,7 +134,6 @@ def apply(
     def process_weights_after_loading(layer):
         layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
         layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
         layer.weight_offset.data = layer.weight_offset.data.flatten()
         layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
             layer.weight.data.to(torch.int32)

From 554027a6db0b305286e5c8bdb412a83ca333ce05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 19 Dec 2025 15:02:34 +0300
Subject: [PATCH 089/175] Fix group_size

---
 .../npu/quantization/fused_moe_method_npu.py        | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 0c9a6940ec0a..13d72581ff81 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -254,15 +254,14 @@ def apply_without_routing_weights(
 class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
 
     @classmethod
-    def process_scale(cls, weight: torch.Tensor, scale, per_group_scale):
+    def process_scale(cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight):
         scale = scale.transpose(1, 2).contiguous()
 
-        ### TODO fix group_size=0 behavior
-        """if cls.is_per_channel_weight:
+        if is_per_channel_weight:
             scale_np = scale.cpu().numpy()
             scale_np.dtype = np.uint32
             scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()
-            return scale_uint64_tensor, None"""
+            return scale_uint64_tensor, None
 
         per_group_scale = per_group_scale.transpose(1, 2).contiguous()
         group_num, k, n = weight.shape
@@ -306,7 +305,7 @@ def pack_to_int32(cls, weight: torch.Tensor):
         return weight.view(torch.int32).contiguous()
 
     @classmethod
-    def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
+    def process_weights_after_loading(cls, layer: torch.nn.Module, is_per_channel_weight) -> None:
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
         )
@@ -325,10 +324,10 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
             else None
         )
         layer.w13_weight_scale.data, w13_bias = cls.process_scale(
-            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second
+            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight
         )
         layer.w2_weight_scale.data, w2_bias = cls.process_scale(
-            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second
+            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight
         )
         if hasattr(layer, "w13_weight_scale_second"):
             # scale_second is no longer used, release this part of the memory

From ad52cda6d3882cc9940916168d3985be4b81590a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 19 Dec 2025 15:03:26 +0300
Subject: [PATCH 090/175] Fix group_size

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index e7d7f6c3c745..2729c8e0d477 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -212,7 +212,7 @@ def create_weights(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer)
+        NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer, self.is_per_channel_weight)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"

From 1d0eddb200441f03627ff9e5153f8592b04ab1d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 19 Dec 2025 15:29:20 +0300
Subject: [PATCH 091/175] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 13d72581ff81..830d5ca2fa11 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -362,8 +362,8 @@ def apply(
         num_tokens = hidden_states.shape[:-1].numel()
 
         first_expert_idx = 0
-        last_expert_idx = 128
-        global_num_experts = 128
+        last_expert_idx = layer.num_experts
+        global_num_experts = layer.num_experts
 
         sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = (
             torch.ops.npu.npu_moe_init_routing_v2(

From c2e972fdb8a5440b6077898bd1587b42a8554472 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 22 Dec 2025 10:31:50 +0300
Subject: [PATCH 092/175] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 830d5ca2fa11..3c4063b21fe7 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -384,7 +384,6 @@ def apply(
         bias2 = [layer.w2_scale_bias]
         w1_scale = [layer.w13_weight_scale]
         w2_scale = [layer.w2_weight_scale]
-        # TODO w4a8 scene: dynamic acquisition of dtype in the future
         _output_dtype = torch.bfloat16
 
         hidden_states = torch.ops.npu.npu_grouped_matmul(

From 3bc7fafd905dca83fc8a69d2ce60edc75fcea2d9 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Mon, 22 Dec 2025 10:32:26 +0300
Subject: [PATCH 093/175] Pre-commit fixes

---
 .../npu/quantization/fused_moe_method_npu.py   | 18 ++++++++++++++----
 .../compressed_tensors/compressed_tensors.py   |  2 +-
 .../compressed_tensors_moe.py                  |  9 +++++++--
 .../msmodelslim/msmodelslim_moe.py             |  4 +++-
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 3c4063b21fe7..4a7f2e22845d 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -254,7 +254,9 @@ def apply_without_routing_weights(
 class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
 
     @classmethod
-    def process_scale(cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight):
+    def process_scale(
+        cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight
+    ):
         scale = scale.transpose(1, 2).contiguous()
 
         if is_per_channel_weight:
@@ -305,7 +307,9 @@ def pack_to_int32(cls, weight: torch.Tensor):
         return weight.view(torch.int32).contiguous()
 
     @classmethod
-    def process_weights_after_loading(cls, layer: torch.nn.Module, is_per_channel_weight) -> None:
+    def process_weights_after_loading(
+        cls, layer: torch.nn.Module, is_per_channel_weight
+    ) -> None:
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
         )
@@ -324,10 +328,16 @@ def process_weights_after_loading(cls, layer: torch.nn.Module, is_per_channel_we
             else None
         )
         layer.w13_weight_scale.data, w13_bias = cls.process_scale(
-            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight
+            layer.w13_weight,
+            layer.w13_weight_scale.data,
+            w13_weight_scale_second,
+            is_per_channel_weight,
         )
         layer.w2_weight_scale.data, w2_bias = cls.process_scale(
-            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight
+            layer.w2_weight,
+            layer.w2_weight_scale.data,
+            w2_weight_scale_second,
+            is_per_channel_weight,
         )
         if hasattr(layer, "w13_weight_scale_second"):
             # scale_second is no longer used, release this part of the memory
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 8a41fe3fa7b1..0ed642950fbc 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -42,9 +42,9 @@
     CompressedTensorsScheme,
     CompressedTensorsW4A4Fp4,
     CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16,
-    CompressedTensorsW8A8Int8,
     NPUCompressedTensorsW8A8Int8,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import (
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 5cd2ec12792a..39d53e88ee3f 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -115,8 +115,13 @@ def get_moe_method(
                 logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                 return CompressedTensorsWNA16MoEMethod(quant_config)
             else:
-                if quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is None:
-                    logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod")
+                if (
+                    quant_config._is_dynamic_token_w4(weight_quant, input_quant)
+                    and input_quant is None
+                ):
+                    logger.info_once(
+                        "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod"
+                    )
                     return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config)
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod")
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 2729c8e0d477..5cec89a39773 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -212,7 +212,9 @@ def create_weights(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(layer, self.is_per_channel_weight)
+        NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(
+            layer, self.is_per_channel_weight
+        )
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"

From ff1f793736f7bb27d51601061c5e112d6ef80989 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 22 Dec 2025 12:34:23 +0300
Subject: [PATCH 094/175] Fix Qwen3-32B AWQ issue

---
 python/sglang/srt/layers/quantization/awq.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/awq.py b/python/sglang/srt/layers/quantization/awq.py
index 5497900a0ce3..69f192840467 100644
--- a/python/sglang/srt/layers/quantization/awq.py
+++ b/python/sglang/srt/layers/quantization/awq.py
@@ -627,8 +627,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         qzeros_tmp = -(qzeros_tmp - 8)
         qzeros_tmp = qzeros_tmp.to(layer.scales.data.dtype)
 
-        layer.qzeros = torch.nn.Parameter(qzeros_tmp, requires_grad=False)
-        layer.qweight = torch.nn.Parameter(qweight_tmp, requires_grad=False)
+        layer.zeros = torch.nn.Parameter(qzeros_tmp, requires_grad=False)
+        layer.weight = torch.nn.Parameter(qweight_tmp, requires_grad=False)
 
     def apply(
         self,
@@ -636,9 +636,9 @@ def apply(
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        qweight = layer.qweight
+        qweight = layer.weight
         scales = layer.scales
-        qzeros = layer.qzeros
+        qzeros = layer.zeros
         pack_factor = self.quant_config.pack_factor
         out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
         reshaped_x = x.reshape(-1, x.shape[-1])

From 7cbf9645ee185db010805f13534bf2f43c5b95e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 22 Dec 2025 13:17:31 +0300
Subject: [PATCH 095/175] Update ascend_npu_quantization.md

---
 docs/platforms/ascend_npu_quantization.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md
index 860ad950f259..172b5e295fb6 100644
--- a/docs/platforms/ascend_npu_quantization.md
+++ b/docs/platforms/ascend_npu_quantization.md
@@ -13,7 +13,6 @@ AWQ on Ascend support:
 - [x] W4A16 linear
 - [x] W8A16 linear # Test required
 - [x] W4A16 MOE # Test required
-- [x] W8A16 MOE # Test required
 
 Compressed-tensors (LLM Compressor) on Ascend support:
 - [x] W8A8 dynamic linear

From 7b20ccf164cd563151a1a4ea839d238f621ec91b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 22 Dec 2025 13:19:07 +0300
Subject: [PATCH 096/175] Update ascend_npu_quantization.md


From e1cabfa2faf521329282484fe75e5a83776f9af3 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:07:47 +0300
Subject: [PATCH 097/175] Update fused_moe_method_npu.py

Static method removal commit 1/9
---
 .../npu/quantization/fused_moe_method_npu.py  | 87 +++++++++----------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 4a7f2e22845d..f1ee05f2584c 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import numpy as np
 import torch
@@ -11,6 +11,7 @@
         CombineInput,
         StandardDispatchOutput,
     )
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
 
 
 def npu_fused_experts(
@@ -138,22 +139,29 @@ def npu_fused_moe_without_routing_weights_bf16(
     return hidden_states
 
 
-class NPUW8A8Int8DynamicMoEMethod(FusedMoEMethodBase):
+class _NPUFusedMoEMethodBase(FusedMoEMethodBase):
 
-    @classmethod
-    def release_weight_cache(cls, weight: torch.Tensor):
+    def __init__(
+        self,
+        quant_config: Optional["QuantizationConfig"] = None,
+    ):
+        self.quant_config = quant_config
+
+
+class NPUW8A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase):
+
+    def _release_weight_cache(self, weight: torch.Tensor):
         # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
         origin_weight = weight.data.transpose(1, 2)
         new_weight = origin_weight.contiguous()
         origin_weight.untyped_storage().resize_(0)
         return new_weight
 
-    @classmethod
-    def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
-        weight_data = cls.release_weight_cache(layer.w13_weight.data)
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight_data = self._release_weight_cache(layer.w13_weight.data)
         layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
 
-        weight_data = cls.release_weight_cache(layer.w2_weight.data)
+        weight_data = self._release_weight_cache(layer.w2_weight.data)
         layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
 
         layer.w13_weight_scale = torch.nn.Parameter(
@@ -178,8 +186,8 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
 
-    @staticmethod
     def apply(
+        self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
@@ -203,8 +211,8 @@ def apply(
         )
         return StandardCombineInput(hidden_states=output)
 
-    @staticmethod
     def apply_without_routing_weights(
+        self,
         layer,
         hidden_states,
         hidden_states_scale,
@@ -251,12 +259,9 @@ def apply_without_routing_weights(
         return hidden_states
 
 
-class NPUW4A8Int8DynamicMoEMethod(FusedMoEMethodBase):
+class NPUW4A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase):
 
-    @classmethod
-    def process_scale(
-        cls, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight
-    ):
+    def _process_scale(self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight):
         scale = scale.transpose(1, 2).contiguous()
 
         if is_per_channel_weight:
@@ -289,8 +294,7 @@ def process_scale(
         sscale_uint64_tensor = sscale_uint64_tensor.npu()
         return sscale_uint64_tensor, bias
 
-    @classmethod
-    def update_bias(cls, layer, w13_bias, w2_bias):
+    def _update_bias(self, layer, w13_bias, w2_bias):
         layer.w13_scale_bias.data = (
             layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
         )
@@ -298,17 +302,15 @@ def update_bias(cls, layer, w13_bias, w2_bias):
             layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
         )
 
-    @classmethod
-    def pack_to_int32(cls, weight: torch.Tensor):
+    def _pack_to_int32(self, weight: torch.Tensor):
         # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
         assert (
             weight.shape[-1] % 4 == 0
         ), "the last dim of weight needs to be divided by 4"
         return weight.view(torch.int32).contiguous()
 
-    @classmethod
     def process_weights_after_loading(
-        cls, layer: torch.nn.Module, is_per_channel_weight
+        self, layer: torch.nn.Module, is_per_channel_weight
     ) -> None:
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
@@ -327,17 +329,17 @@ def process_weights_after_loading(
             if hasattr(layer, "w2_weight_scale_second")
             else None
         )
-        layer.w13_weight_scale.data, w13_bias = cls.process_scale(
+        layer.w13_weight_scale.data, w13_bias = self._process_scale(
             layer.w13_weight,
             layer.w13_weight_scale.data,
             w13_weight_scale_second,
-            is_per_channel_weight,
+            is_per_channel_weight
         )
-        layer.w2_weight_scale.data, w2_bias = cls.process_scale(
+        layer.w2_weight_scale.data, w2_bias = self._process_scale(
             layer.w2_weight,
             layer.w2_weight_scale.data,
             w2_weight_scale_second,
-            is_per_channel_weight,
+            is_per_channel_weight
         )
         if hasattr(layer, "w13_weight_scale_second"):
             # scale_second is no longer used, release this part of the memory
@@ -346,15 +348,15 @@ def process_weights_after_loading(
             del layer.w13_weight_offset_second
             del layer.w2_weight_offset_second
 
-        cls.update_bias(layer, w13_bias, w2_bias)
+        self._update_bias(layer, w13_bias, w2_bias)
 
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
-        layer.w13_weight.data = cls.pack_to_int32(layer.w13_weight.data)
-        layer.w2_weight.data = cls.pack_to_int32(layer.w2_weight.data)
+        layer.w13_weight.data = self._pack_to_int32(layer.w13_weight.data)
+        layer.w2_weight.data = self._pack_to_int32(layer.w2_weight.data)
 
-    @staticmethod
     def apply(
+        self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
@@ -437,8 +439,8 @@ def apply(
 
         return StandardCombineInput(hidden_states=final_hidden_states)
 
-    @staticmethod
     def apply_without_routing_weights(
+        self,
         layer,
         hidden_states,
         hidden_states_scale,
@@ -481,10 +483,9 @@ def apply_without_routing_weights(
         return hidden_states
 
 
-class NPUW4A16Int4DynamicMoEMethod(FusedMoEMethodBase):
+class NPUW4A16Int4DynamicMoEMethod(_NPUFusedMoEMethodBase):
 
-    @classmethod
-    def pack_to_int32(cls, weight: torch.Tensor):
+    def _pack_to_int32(self, weight: torch.Tensor):
         assert weight.dim() == 3
         if weight.dtype == torch.int32:
             # pack 8 int4 to int32, we use a int32 to represent a int4
@@ -505,9 +506,8 @@ def pack_to_int32(cls, weight: torch.Tensor):
             raise ValueError(f"{weight.dtype=} is not supported !")
         return new_weight
 
-    @classmethod
-    def unpack_from_int32(
-        cls,
+    def _unpack_from_int32(
+        self,
         value: torch.Tensor,
         num_bits: int,
         shape: torch.Size = None,
@@ -570,8 +570,7 @@ def unpack_from_int32(
 
         return unpacked
 
-    @classmethod
-    def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous()
         w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous()
         layer.w13_weight_scale = torch.nn.Parameter(
@@ -592,28 +591,28 @@ def process_weights_after_loading(cls, layer: torch.nn.Module) -> None:
         # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous()
         # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous()
         unpacked_w13_weight = (
-            cls.unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4)
+            self._unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4)
             .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1)
             .transpose(1, 2)
             .contiguous()
             .int()
         )
         unpacked_w2_weight = (
-            cls.unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4)
+            self._unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4)
             .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1)
             .transpose(1, 2)
             .contiguous()
             .int()
         )
 
-        w13_weight = cls.pack_to_int32(unpacked_w13_weight)
-        w2_weight = cls.pack_to_int32(unpacked_w2_weight)
+        w13_weight = self._pack_to_int32(unpacked_w13_weight)
+        w2_weight = self._pack_to_int32(unpacked_w2_weight)
 
         layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
         layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
 
-    @staticmethod
     def apply(
+        self,
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
@@ -640,8 +639,8 @@ def apply(
         )
         return StandardCombineInput(hidden_states=output)
 
-    @staticmethod
     def apply_without_routing_weights(
+        self,
         layer,
         hidden_states,
         hidden_states_scale,

From 734ab1dc712593e08e9eb28c54c467f87f77f14e Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:08:27 +0300
Subject: [PATCH 098/175] Update linear_method_npu.py

Static method removal 2/9
---
 .../npu/quantization/linear_method_npu.py     | 81 +++++++++----------
 1 file changed, 39 insertions(+), 42 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
index 6ab0d35652d1..3a99f6ac7c3b 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -15,14 +15,36 @@ def __init__(
         self,
         quant_config: Optional["QuantizationConfig"] = None,
     ):
-        super().__init__()
         self.quant_config = quant_config
 
 
 class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
 
-    @staticmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        # Compressed-tensors format doesn't have this field
+        if hasattr(layer, "weight_offset"):
+            layer.weight_offset.data = layer.weight_offset.data.flatten()
+
+        expanding_factor = layer.weight.data.shape[0]
+        layer.aclnn_input_scale = torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_offset = torch.nn.Parameter(
+            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+
     def apply(
+        self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
@@ -53,8 +75,10 @@ def apply(
             output_dtype=original_dtype,
         )
 
-    @staticmethod
-    def process_weights_after_loading(layer: torch.nn.Module):
+
+class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase):
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
         layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
         layer.weight.data = npu_format_cast(layer.weight.data)
 
@@ -63,25 +87,8 @@ def process_weights_after_loading(layer: torch.nn.Module):
         if hasattr(layer, "weight_offset"):
             layer.weight_offset.data = layer.weight_offset.data.flatten()
 
-        expanding_factor = layer.weight.data.shape[0]
-        layer.aclnn_input_scale = torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
-            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-        layer.aclnn_input_offset = torch.nn.Parameter(
-            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
-            requires_grad=False,
-        )
-
-
-class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase):
-
-    @staticmethod
     def apply(
+        self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
@@ -97,21 +104,20 @@ def apply(
             output_dtype=original_dtype,
         )
 
-    @staticmethod
-    def process_weights_after_loading(layer: torch.nn.Module):
-        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight.data = npu_format_cast(layer.weight.data)
-
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
-        # Compressed-tensors format doesn't have this field
-        if hasattr(layer, "weight_offset"):
-            layer.weight_offset.data = layer.weight_offset.data.flatten()
-
 
 class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase):
 
-    @staticmethod
+    def process_weights_after_loading(self, layer):
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
+            layer.weight.data.to(torch.int32)
+        )
+
     def apply(
+        self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
@@ -129,12 +135,3 @@ def apply(
             bias=bias,
             output_dtype=original_dtype,
         )
-
-    @staticmethod
-    def process_weights_after_loading(layer):
-        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        layer.weight_scale.data = layer.weight_scale.data.flatten()
-        layer.weight_offset.data = layer.weight_offset.data.flatten()
-        layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
-            layer.weight.data.to(torch.int32)
-        )

From 93533b07bf20e1f4e95de4a7a407ac07d2240b13 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:09:26 +0300
Subject: [PATCH 099/175] Update base_config.py

static method removal 3/9
---
 python/sglang/srt/layers/quantization/base_config.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/base_config.py b/python/sglang/srt/layers/quantization/base_config.py
index 8297124cc4c0..3e93cfde7e70 100644
--- a/python/sglang/srt/layers/quantization/base_config.py
+++ b/python/sglang/srt/layers/quantization/base_config.py
@@ -17,7 +17,6 @@
 class QuantizeMethodBase(ABC):
     """Base class for different quantized methods."""
 
-    @abstractmethod
     def create_weights(
         self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
     ):
@@ -44,7 +43,6 @@ def process_weights_after_loading(self, layer: nn.Module) -> None:
 class LinearMethodBase(QuantizeMethodBase):
     """Base class for different (maybe quantized) linear methods."""
 
-    @abstractmethod
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -84,7 +82,6 @@ def apply(
 
 class FusedMoEMethodBase(QuantizeMethodBase):
 
-    @abstractmethod
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -96,7 +93,6 @@ def create_weights(
     ):
         raise NotImplementedError
 
-    @abstractmethod
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
     ):

From 0cd79c63389a9598eaf1fb773deda22d9c80bd2a Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:10:11 +0300
Subject: [PATCH 100/175] Update compressed_tensors_moe.py

static method removal 4/9
---
 .../compressed_tensors/compressed_tensors_moe.py    | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 39d53e88ee3f..0d876f489bbd 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -887,6 +887,7 @@ def __init__(self, quant_config: CompressedTensorsConfig):
         self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
             "input_activations"
         )
+        self.kernel = NPUW8A8Int8DynamicMoEMethod()
 
         self.static_input_scales = not self.input_quant.dynamic
         per_channel = (
@@ -973,7 +974,7 @@ def create_weights(
         layer.w2_input_scale = None
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer)
+        self.kernel.process_weights_after_loading(layer)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
@@ -986,7 +987,7 @@ def apply(
         dispatch_output: StandardDispatchOutput,
     ) -> CombineInput:
 
-        return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+        return self.kernel.apply(layer, dispatch_output)
 
 
 class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
@@ -1310,6 +1311,8 @@ def __init__(self, quantization_config) -> None:
             ].group_size
         else:
             self.group_size = 128
+        
+        self.kernel = NPUW4A16Int4DynamicMoEMethod()
 
     # TODO: See if we can merge this method's logic
     # with CompressedTensorsWNA16MoEMethod. Need more models and tests.
@@ -1412,7 +1415,7 @@ def create_weights(
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        NPUW4A16Int4DynamicMoEMethod.process_weights_after_loading(layer)
+        self.kernel.process_weights_after_loading(layer)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
@@ -1425,7 +1428,7 @@ def apply(
         dispatch_output: StandardDispatchOutput,
     ) -> CombineInput:
 
-        return NPUW4A16Int4DynamicMoEMethod.apply(layer, dispatch_output)
+        return self.kernel.apply(layer, dispatch_output)
 
     def apply_without_routing_weights(
         self,
@@ -1436,7 +1439,7 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW4A16Int4DynamicMoEMethod.apply_without_routing_weights(
+        return self.kernel.apply_without_routing_weights(
             layer,
             hidden_states,
             hidden_states_scale,

From a9d4847b80cc094b1a03d6338093a8548ee9ec18 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:12:08 +0300
Subject: [PATCH 101/175] Update compressed_tensors_w8a8_int8.py

static method removal 5/9
---
 .../schemes/compressed_tensors_w8a8_int8.py                  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 6db89e9f1ac2..efcd4b611fa9 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -188,13 +188,14 @@ def __init__(
             raise NotImplementedError(
                 "Static compressed-tensors scheme is not yet supported on NPU."
             )
+        self.kernel = NPUW8A8Int8DynamicLinearMethod()
 
     @classmethod
     def get_min_capability(cls) -> int:
         return NotImplementedError
 
     def process_weights_after_loading(self, layer):
-        return NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
+        return self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(self, layer, x, bias):
-        return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias)
+        return self.kernel.apply(layer, x, bias)

From af3756b101b09448bb90887725cf6c75242bc7de Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:12:40 +0300
Subject: [PATCH 102/175] Update msmodelslim.py

static method removal 6/9
---
 .../srt/layers/quantization/msmodelslim/msmodelslim.py       | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
index dc43e6b79b5d..61913209da4a 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
@@ -150,10 +150,6 @@ def get_quant_method(
             if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
                 return UnquantizedLinearMethod()
             scheme = self.get_scheme(layer=layer, layer_name=prefix_in_quant_config)
-            if scheme is None:
-                raise NotImplementedError(
-                    "At the moment SGLang on Ascend supports only w4a4 dynamic, w8a8 static/dynamic linear schemes."
-                )
             layer.scheme = scheme
             return ModelSlimLinearMethod(self)
         elif isinstance(layer, FusedMoE):
@@ -174,6 +170,7 @@ def _get_scheme_from_parts(
             return ModelSlimW4A4Int4(
                 quant_config=self.quant_description, prefix=layer_name
             )
+        raise NotImplementedError("No modelslim compatible scheme was found.")
 
     def get_scheme(
         self, layer: torch.nn.Module, layer_name: Optional[str] = None

From 1ddd8d440153675f448f8e390fb0fcfb5bdc6f61 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:13:04 +0300
Subject: [PATCH 103/175] Update msmodelslim_moe.py

static method removal 7/9
---
 .../msmodelslim/msmodelslim_moe.py            | 32 ++++++++++++-------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 5cec89a39773..435ba8a4f945 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -52,11 +52,21 @@ def get_moe_method(
             quant_config.quant_description.get(prefix_in_quant_config, "STATIC")
             == "W4A8_DYNAMIC"
         )
-
+        is_moe_w8a8_dynamic = (
+            quant_config.quant_description.get(prefix_in_quant_config, "STATIC")
+            == "W8A8_DYNAMIC"
+        )
         if is_moe_w4a8_dynamic:
+            logger.info_once("Using ModelSlimW4A8Int8MoE")
             return ModelSlimW4A8Int8MoE(quant_config)
-
-        return ModelSlimW8A8Int8MoE(quant_config)
+        elif is_moe_w8a8_dynamic:
+            logger.info_once("Using ModelSlimW8A8Int8MoE")
+            return ModelSlimW8A8Int8MoE(quant_config)
+        else:
+            raise RuntimeError(
+                f"Unsupported FusedMoe modelslim scheme: \
+                    {quant_config.quant_description.get(prefix_in_quant_config)}"
+            )
 
 
 class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod):
@@ -69,6 +79,7 @@ def __init__(
         self.quant_config = quant_config
         self.group_size = 0
         self.tp_size = 1
+        self.kernel = NPUW4A8Int8DynamicMoEMethod()
 
     def create_weights(
         self,
@@ -212,9 +223,7 @@ def create_weights(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        NPUW4A8Int8DynamicMoEMethod.process_weights_after_loading(
-            layer, self.is_per_channel_weight
-        )
+        self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
@@ -226,7 +235,7 @@ def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
-        return NPUW4A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+        return self.kernel.apply(layer, dispatch_output)
 
     def apply_without_routing_weights(
         self,
@@ -237,7 +246,7 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW4A8Int8DynamicMoEMethod.apply_without_routing_weights(
+        return self.kernel.apply_without_routing_weights(
             layer,
             hidden_states,
             hidden_states_scale,
@@ -255,6 +264,7 @@ def __init__(
         prefix: str = None,
     ):
         self.quant_config = quant_config
+        self.kernel = NPUW8A8Int8DynamicMoEMethod()
 
     def create_weights(
         self,
@@ -327,7 +337,7 @@ def create_weights(
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        NPUW8A8Int8DynamicMoEMethod.process_weights_after_loading(layer)
+        self.kernel.process_weights_after_loading(layer)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
@@ -339,7 +349,7 @@ def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
-        return NPUW8A8Int8DynamicMoEMethod.apply(layer, dispatch_output)
+        return self.kernel.apply(layer, dispatch_output)
 
     def apply_without_routing_weights(
         self,
@@ -350,7 +360,7 @@ def apply_without_routing_weights(
         group_list,
         output_dtype,
     ):
-        return NPUW8A8Int8DynamicMoEMethod.apply_without_routing_weights(
+        return self.kernel.apply_without_routing_weights(
             layer,
             hidden_states,
             hidden_states_scale,

From 76a1e948bd265158fb38ba0f668a0db8e35baa7a Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:13:54 +0300
Subject: [PATCH 104/175] Update msmodelslim_w4a4_int4.py

static method removal 8/9
---
 .../msmodelslim/schemes/msmodelslim_w4a4_int4.py             | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
index 1b578837c8d4..6fb7561cc438 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
@@ -22,6 +22,7 @@ def __init__(
     ):
         self.quant_config = quant_config
         self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC"
+        self.kernel = NPU_W4A4DynamicLinearMethod()
 
     @staticmethod
     def get_weight(
@@ -87,7 +88,7 @@ def create_weights(
             set_weight_attrs(param, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer):
-        NPU_W4A4DynamicLinearMethod.process_weights_after_loading(layer)
+        self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
         self,
@@ -95,4 +96,4 @@ def apply_weights(
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        return NPU_W4A4DynamicLinearMethod.apply(layer, x, bias)
+        return self.kernel.apply(layer, x, bias)

From f773ee4b92c10968dd23c344ef3ad1f17baeb7b2 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Mon, 22 Dec 2025 17:14:19 +0300
Subject: [PATCH 105/175] Update msmodelslim_w8a8_int8.py

static method removal 9/9
---
 .../msmodelslim/schemes/msmodelslim_w8a8_int8.py   | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index 8250c7c4c576..1e1e99fc174d 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -28,6 +28,10 @@ def __init__(
         self.is_dynamic = (
             self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC"
         )
+        if self.is_dynamic:
+            self.kernel = NPUW8A8Int8DynamicLinearMethod()
+        else:
+            self.kernel = NPUW8A8Int8LinearMethod()
 
     def create_weights(
         self,
@@ -102,10 +106,7 @@ def create_weights(
             layer.register_parameter("deq_scale", deq_scale)
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
-        if self.is_dynamic:
-            NPUW8A8Int8DynamicLinearMethod.process_weights_after_loading(layer)
-        else:
-            NPUW8A8Int8LinearMethod.process_weights_after_loading(layer)
+            self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
         self,
@@ -113,7 +114,4 @@ def apply_weights(
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        if self.is_dynamic:
-            return NPUW8A8Int8DynamicLinearMethod.apply(layer, x, bias)
-        else:
-            return NPUW8A8Int8LinearMethod.apply(layer, x, bias)
+            return self.kernel.apply(layer, x, bias)

From a6d161985e53f243eb2a2d2c66ad78d9a216ad70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 23 Dec 2025 11:39:53 +0300
Subject: [PATCH 106/175] Update msmodelslim_moe.py

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 435ba8a4f945..ff8639b7678e 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -235,7 +235,11 @@ def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
-        return self.kernel.apply(layer, dispatch_output)
+        # FIXME W4A8 without EP gives 0 accuracy
+        raise NotImplementedError(
+            f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep"
+        )
+        # return self.kernel.apply(layer, dispatch_output)
 
     def apply_without_routing_weights(
         self,

From 789c2468b69609ae5bdeb80ab6aab404584bbbf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 23 Dec 2025 11:45:54 +0300
Subject: [PATCH 107/175] Fix lint issue

---
 .../npu/quantization/fused_moe_method_npu.py              | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index f1ee05f2584c..05ace7966a79 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -261,7 +261,9 @@ def apply_without_routing_weights(
 
 class NPUW4A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase):
 
-    def _process_scale(self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight):
+    def _process_scale(
+        self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight
+    ):
         scale = scale.transpose(1, 2).contiguous()
 
         if is_per_channel_weight:
@@ -333,13 +335,13 @@ def process_weights_after_loading(
             layer.w13_weight,
             layer.w13_weight_scale.data,
             w13_weight_scale_second,
-            is_per_channel_weight
+            is_per_channel_weight,
         )
         layer.w2_weight_scale.data, w2_bias = self._process_scale(
             layer.w2_weight,
             layer.w2_weight_scale.data,
             w2_weight_scale_second,
-            is_per_channel_weight
+            is_per_channel_weight,
         )
         if hasattr(layer, "w13_weight_scale_second"):
             # scale_second is no longer used, release this part of the memory

From 2cc4db4a3faccb94d867d2075f50f911999805a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 23 Dec 2025 11:46:59 +0300
Subject: [PATCH 108/175] Fix lint issue

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 0d876f489bbd..1947e331c740 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1311,7 +1311,7 @@ def __init__(self, quantization_config) -> None:
             ].group_size
         else:
             self.group_size = 128
-        
+
         self.kernel = NPUW4A16Int4DynamicMoEMethod()
 
     # TODO: See if we can merge this method's logic

From 94827ef52e428c8510d8bcca8b46c5be5e5db2ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 23 Dec 2025 11:47:47 +0300
Subject: [PATCH 109/175] Fix lint issue

---
 .../quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
index 1e1e99fc174d..9986e1976eaf 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
@@ -106,7 +106,7 @@ def create_weights(
             layer.register_parameter("deq_scale", deq_scale)
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
-            self.kernel.process_weights_after_loading(layer)
+        self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
         self,
@@ -114,4 +114,4 @@ def apply_weights(
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-            return self.kernel.apply(layer, x, bias)
+        return self.kernel.apply(layer, x, bias)

From 1a30a428fd6bd05237ccbb8d21c26eccfc02aba3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 23 Dec 2025 15:12:24 +0300
Subject: [PATCH 110/175] Change local path to modelscope

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index c2251ec94a9d..fbce7bdd9327 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -21,6 +21,8 @@
     popen_launch_server,
 )
 
+os.environ['SGLANG_USE_MODELSCOPE'] = 'true'
+
 if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
     os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
 DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
@@ -32,7 +34,7 @@
 class TestAscendW4A4(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = "/root/.cache/modelscope/hub/models/msit/Qwen3-8B-W4A4/"
+        cls.model = "Eco-Tech/Qwen3-8B-w4a4-QuaRot"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model,

From f539100c86df00d36e062f90a14440f7dd43c49d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 23 Dec 2025 15:15:52 +0300
Subject: [PATCH 111/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index fbce7bdd9327..9525b0407a08 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -21,7 +21,7 @@
     popen_launch_server,
 )
 
-os.environ['SGLANG_USE_MODELSCOPE'] = 'true'
+os.environ["SGLANG_USE_MODELSCOPE"] = "true"
 
 if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
     os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"

From 01f6c58a938d526540ef83b76273b61d861cfcd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 23 Dec 2025 16:02:21 +0300
Subject: [PATCH 112/175] Temporary fix

---
 test/srt/run_suite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index d09d484339a0..fb093110ae6d 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -370,7 +370,7 @@
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 1600),
     ],
     "per-commit-4-npu-a2": [
         TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),

From c9a8122ef04f56f3be258c26f70cd1f5d0140142 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 24 Dec 2025 11:13:42 +0300
Subject: [PATCH 113/175] Update test_ascend_w8a8_quantization.py

---
 test/srt/ascend/test_ascend_w8a8_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w8a8_quantization.py b/test/srt/ascend/test_ascend_w8a8_quantization.py
index f3f9cdff952b..e0b3545701c6 100644
--- a/test/srt/ascend/test_ascend_w8a8_quantization.py
+++ b/test/srt/ascend/test_ascend_w8a8_quantization.py
@@ -69,7 +69,7 @@ def test_gsm8k(self):
         print(metrics)
 
         self.assertGreaterEqual(metrics["accuracy"], 0.3)
-        self.assertGreaterEqual(metrics["output_throughput"], 1000)
+        self.assertGreaterEqual(metrics["output_throughput"], 700)
 
     def run_decode(self, max_new_tokens):
         response = requests.post(

From 6bb9f20b45d3f4ff77f48ffe70fc3873ca1db8fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 24 Dec 2025 11:14:33 +0300
Subject: [PATCH 114/175] Update run_suite.py

---
 test/srt/run_suite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 5687cbeaf17c..c5397ff0a23f 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -354,7 +354,7 @@
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 1600),
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
     ],
     "per-commit-4-npu-a2": [
         TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),

From 836dc164871bca29058e8e28ebf76de22a9ed5f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 24 Dec 2025 11:18:41 +0300
Subject: [PATCH 115/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 9525b0407a08..975424e1883e 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -34,7 +34,7 @@
 class TestAscendW4A4(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = "Eco-Tech/Qwen3-8B-w4a4-QuaRot"
+        cls.model = "/root/.cache/modelscope/hub/models/Eco-Tech/Qwen3-8B-w4a4-QuaRot"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model,

From 14b6ab8770aa9116b48cd5792e87011cc13672c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 24 Dec 2025 16:15:25 +0300
Subject: [PATCH 116/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 975424e1883e..e1e1c430b7b9 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -75,7 +75,7 @@ def test_gsm8k(self):
         metrics = run_eval(args)
         print(metrics)
 
-        self.assertGreaterEqual(metrics["accuracy"], 0.75)
+        self.assertGreaterEqual(metrics["accuracy"], 0.50)
         self.assertGreaterEqual(metrics["output_throughput"], 700)
 
     def run_decode(self, max_new_tokens):

From 15040ccb2e6b4c244c4933520de9de60ddddc112 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 25 Dec 2025 14:55:08 +0300
Subject: [PATCH 117/175] Update msmodelslim_moe.py

Debug pring for CI
---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index ff8639b7678e..f9f677e5d520 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -63,6 +63,7 @@ def get_moe_method(
             logger.info_once("Using ModelSlimW8A8Int8MoE")
             return ModelSlimW8A8Int8MoE(quant_config)
         else:
+            print(f"Layer: {prefix_in_quant_config}")
             raise RuntimeError(
                 f"Unsupported FusedMoe modelslim scheme: \
                     {quant_config.quant_description.get(prefix_in_quant_config)}"

From 5a1c7ece5d7bf3b54ec7c8df860538502c47fe28 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Thu, 25 Dec 2025 19:00:39 +0300
Subject: [PATCH 118/175] Update msmodelslim_moe.py

ModelSlim unquant MoE layer processing
---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index f9f677e5d520..6c71523ff16a 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -63,11 +63,12 @@ def get_moe_method(
             logger.info_once("Using ModelSlimW8A8Int8MoE")
             return ModelSlimW8A8Int8MoE(quant_config)
         else:
-            print(f"Layer: {prefix_in_quant_config}")
-            raise RuntimeError(
+            logger.warning(
                 f"Unsupported FusedMoe modelslim scheme: \
-                    {quant_config.quant_description.get(prefix_in_quant_config)}"
+                    {quant_config.quant_description.get(prefix_in_quant_config.strip())} \
+                    in layer: {prefix}"
             )
+            return None
 
 
 class ModelSlimW4A8Int8MoE(ModelSlimMoEMethod):

From a26d9e66196f38bc8c43adddc5c38ad9d552ed80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 25 Dec 2025 19:05:07 +0300
Subject: [PATCH 119/175] Update run_suite.py

---
 test/srt/run_suite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index c5397ff0a23f..952272adbbac 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -354,7 +354,6 @@
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
     ],
     "per-commit-4-npu-a2": [
         TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),
@@ -362,6 +361,7 @@
         TestFile("ascend/test_ascend_tp4_bf16.py", 400),
     ],
     "per-commit-16-npu-a3": [
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_deepep.py", 400),
         TestFile("ascend/test_ascend_deepseek_mtp.py", 400),
     ],

From 1d4446639f8af04dcb73d285109a5029e5f00d03 Mon Sep 17 00:00:00 2001
From: Tamir Baydasov <41994229+TamirBaydasov@users.noreply.github.com>
Date: Fri, 26 Dec 2025 15:09:27 +0300
Subject: [PATCH 120/175] Add modelslim to optimized methods

---
 python/sglang/srt/configs/model_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 8c532b7a1ffc..327ef0466bcd 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -755,6 +755,7 @@ def _verify_quantization(self) -> None:
             "w4afp8",
             "petit_nvfp4",
             "quark",
+            "modelslim",
         ]
         compatible_quantization_methods = {
             "modelopt_fp8": ["modelopt"],

From 1c888e0202fdbf640c97e7fd821272afc44e3a5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 13:07:56 +0300
Subject: [PATCH 121/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index e1e1c430b7b9..5164d72eb513 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -24,7 +24,7 @@
 os.environ["SGLANG_USE_MODELSCOPE"] = "true"
 
 if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
 DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
     7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
 )
@@ -34,7 +34,7 @@
 class TestAscendW4A4(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = "/root/.cache/modelscope/hub/models/Eco-Tech/Qwen3-8B-w4a4-QuaRot"
+        cls.model = "/root/.cache/modelscope/hub/models/Eco-Tech/Qwen3-32B-w4a4-LAOS"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model,
@@ -47,7 +47,7 @@ def setUpClass(cls):
                 "--attention-backend",
                 "ascend",
                 "--tp-size",
-                "2",
+                "4",
                 "--mem-fraction-static",
                 "0.8",
                 "--cuda-graph-bs",
@@ -66,7 +66,7 @@ def test_gsm8k(self):
         args = SimpleNamespace(
             num_shots=5,
             data_path=None,
-            num_questions=128,
+            num_questions=1319,
             max_new_tokens=512,
             parallel=64,
             host=f"http://{url.hostname}",
@@ -75,8 +75,8 @@ def test_gsm8k(self):
         metrics = run_eval(args)
         print(metrics)
 
-        self.assertGreaterEqual(metrics["accuracy"], 0.50)
-        self.assertGreaterEqual(metrics["output_throughput"], 700)
+        self.assertAlmostEqual(metrics["accuracy"], 0.84)
+        self.assertAlmostEqual(metrics["output_throughput"], 1100)
 
     def run_decode(self, max_new_tokens):
         response = requests.post(
@@ -103,7 +103,7 @@ def test_throughput(self):
         print(f"Throughput: {throughput} tokens/s")
 
         if is_in_ci():
-            self.assertGreaterEqual(throughput, 25)
+            self.assertAlmostEqual(throughput, 38)
 
 
 if __name__ == "__main__":

From 1830d749b328b12a4b863a0194b16c13abda1e7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 13:24:46 +0300
Subject: [PATCH 122/175] Resolve conflicts 1/2

---
 .../npu/quantization/fused_moe_method_npu.py  | 262 +-----------------
 1 file changed, 6 insertions(+), 256 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index f949ba1261c7..06158d8eb580 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -261,235 +261,9 @@ def apply_without_routing_weights(
 
 class NPUW4A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase):
 
-<<<<<<< HEAD
     def _process_scale(
         self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight
     ):
-=======
-    def __init__(self, activation_use_clip: bool) -> None:
-        self.group_size = 0
-        self.tp_size = 1
-        self.activation_use_clip = activation_use_clip
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ) -> None:
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
-
-        self.is_per_channel_weight = self.group_size == 0
-        self.num_experts = num_experts
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
-        )
-
-        # >> weight
-        w13_output_size = intermediate_size_per_partition
-        w2_output_size = hidden_size // 2
-        w13_weight = torch.nn.Parameter(
-            torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                w2_output_size,
-                intermediate_size_per_partition,
-                dtype=torch.int8,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # >> scale
-        weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32
-        w13_weight_scale = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                1,
-                dtype=weight_scale_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
-
-        w2_weight_scale = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-
-        # >> offset
-        w13_weight_offset = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_offset", w13_weight_offset)
-        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
-
-        w2_weight_offset = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_offset", w2_weight_offset)
-        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
-
-        # >>> special param for w4a8
-        if self.activation_use_clip:
-            self._init_activation_clip_params(
-                layer,
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                extra_weight_attrs,
-            )
-        else:
-            self._init_extra_scale_params(
-                layer,
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                extra_weight_attrs,
-            )
-
-    def _init_activation_clip_params(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        extra_weight_attrs: dict,
-    ) -> None:
-        """
-        Initializes bias and alpha parameters for quantization schemes that use activation clipping.
-
-        This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to
-        shift and scale the activations or outputs to compensate for the precision loss
-        introduced by clamping activations.
-        """
-        w13_bias = torch.nn.Parameter(
-            torch.ones(
-                num_experts, 2 * intermediate_size_per_partition, dtype=torch.float
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_bias", w13_bias)
-        set_weight_attrs(w13_bias, extra_weight_attrs)
-
-        w2_bias = torch.nn.Parameter(
-            torch.ones(num_experts, hidden_size, dtype=torch.float),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_bias", w2_bias)
-        set_weight_attrs(w2_bias, extra_weight_attrs)
-
-        w2_alpha = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float), requires_grad=False
-        )
-        layer.register_parameter("w2_alpha", w2_alpha)
-        set_weight_attrs(w2_alpha, extra_weight_attrs)
-
-    def _init_extra_scale_params(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        extra_weight_attrs: dict,
-    ) -> None:
-        """
-        Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping.
-
-        This method registers the following parameters:
-        1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`.
-        2. Secondary Quantization Params (initialized only for grouped quantization):
-            `w13_weight_scale_second`, `w13_weight_offset_second`,
-            `w2_weight_scale_second`, and `w2_weight_offset_second`.
-        """
-        if not self.is_per_channel_weight:
-            w13_weight_scale_second = torch.nn.Parameter(
-                torch.empty(
-                    num_experts,
-                    2 * intermediate_size_per_partition,
-                    hidden_size // self.group_size,
-                    dtype=torch.float32,
-                ),
-                requires_grad=False,
-            )
-            layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
-            set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
-
-            w13_weight_offset_second = torch.nn.Parameter(
-                torch.empty(
-                    num_experts,
-                    2 * intermediate_size_per_partition,
-                    hidden_size // self.group_size,
-                    dtype=torch.float32,
-                ),
-                requires_grad=False,
-            )
-            layer.register_parameter(
-                "w13_weight_offset_second", w13_weight_offset_second
-            )
-            set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
-
-            w2_weight_scale_second = torch.nn.Parameter(
-                torch.empty(
-                    num_experts,
-                    hidden_size,
-                    intermediate_size_per_partition // self.group_size,
-                    dtype=torch.float32,
-                ),
-                requires_grad=False,
-            )
-            layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second)
-            set_weight_attrs(w2_weight_scale_second, extra_weight_attrs)
-
-            w2_weight_offset_second = torch.nn.Parameter(
-                torch.empty(
-                    num_experts,
-                    hidden_size,
-                    intermediate_size_per_partition // self.group_size,
-                    dtype=torch.float32,
-                ),
-                requires_grad=False,
-            )
-            layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second)
-            set_weight_attrs(w2_weight_offset_second, extra_weight_attrs)
-
-        w13_scale_bias = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_scale_bias", w13_scale_bias)
-        set_weight_attrs(w13_scale_bias, extra_weight_attrs)
-
-        w2_scale_bias = torch.nn.Parameter(
-            torch.empty(
-                num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_scale_bias", w2_scale_bias)
-        set_weight_attrs(w2_scale_bias, extra_weight_attrs)
-
-    def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
->>>>>>> sglang-main/main
         scale = scale.transpose(1, 2).contiguous()
 
         if is_per_channel_weight:
@@ -537,18 +311,12 @@ def _pack_to_int32(self, weight: torch.Tensor):
         ), "the last dim of weight needs to be divided by 4"
         return weight.view(torch.int32).contiguous()
 
-<<<<<<< HEAD
-    def process_weights_after_loading(
-        self, layer: torch.nn.Module, is_per_channel_weight
-    ) -> None:
-=======
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if not self.activation_use_clip:
             self._process_weights_without_clip(layer)
         else:
             self._process_weights_with_clip(layer)
 
->>>>>>> sglang-main/main
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
         )
@@ -573,17 +341,11 @@ def _process_weights_without_clip(self, layer: torch.nn.Module) -> None:
             if hasattr(layer, "w2_weight_scale_second")
             else None
         )
-        layer.w13_weight_scale.data, w13_bias = self._process_scale(
-            layer.w13_weight,
-            layer.w13_weight_scale.data,
-            w13_weight_scale_second,
-            is_per_channel_weight,
+        layer.w13_weight_scale.data, w13_bias = self.process_scale(
+            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second
         )
-        layer.w2_weight_scale.data, w2_bias = self._process_scale(
-            layer.w2_weight,
-            layer.w2_weight_scale.data,
-            w2_weight_scale_second,
-            is_per_channel_weight,
+        layer.w2_weight_scale.data, w2_bias = self.process_scale(
+            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second
         )
         if hasattr(layer, "w13_weight_scale_second"):
             # scale_second is no longer used, release this part of the memory
@@ -592,14 +354,8 @@ def _process_weights_without_clip(self, layer: torch.nn.Module) -> None:
             del layer.w13_weight_offset_second
             del layer.w2_weight_offset_second
 
-        self._update_bias(layer, w13_bias, w2_bias)
+        self.update_bias(layer, w13_bias, w2_bias)
 
-<<<<<<< HEAD
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
-        layer.w13_weight.data = self._pack_to_int32(layer.w13_weight.data)
-        layer.w2_weight.data = self._pack_to_int32(layer.w2_weight.data)
-=======
     def _process_weights_with_clip(self, layer: torch.nn.Module) -> None:
         w13_weight_scale = (
             layer.w13_weight_scale.data.squeeze(-1).contiguous().unsqueeze(1)
@@ -614,12 +370,6 @@ def _process_weights_with_clip(self, layer: torch.nn.Module) -> None:
         layer.w13_scale_bias = layer.w13_bias
         layer.w2_scale_bias = layer.w2_bias
 
-    def create_moe_runner(
-        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
-    ):
-        self.moe_runner_config = moe_runner_config
->>>>>>> sglang-main/main
-
     def apply(
         self,
         layer,

From 46a3570dc470a9c37b367f62d168d26c8e35f3f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 13:33:56 +0300
Subject: [PATCH 123/175] Resolve conflicts 2/2

---
 .../compressed_tensors_moe.py                 | 263 ++++++++++++++++++
 1 file changed, 263 insertions(+)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 1947e331c740..c9465828ce0a 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -16,6 +16,7 @@
     use_symmetric_memory,
 )
 from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    NPUW4A8Int8DynamicMoEMethod,
     NPUW4A16Int4DynamicMoEMethod,
     NPUW8A8Int8DynamicMoEMethod,
 )
@@ -85,6 +86,7 @@ class GPTQMarlinState(Enum):
 __all__ = [
     "CompressedTensorsMoEMethod",
     "CompressedTensorsW4A4Nvfp4MoEMethod",
+    "NPUCompressedTensorsW4A8Int8DynamicMoEMethod",
     "CompressedTensorsW8A8Fp8MoEMethod",
     "NPUCompressedTensorsW8A8Int8MoEMethod",
     "CompressedTensorsWNA16MoEMethod",
@@ -1298,6 +1300,267 @@ def apply(
         return StandardCombineInput(hidden_states=output)
 
 
+class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
+
+    def __init__(self, quantization_config) -> None:
+        self.activation_use_clip = activation_use_clip
+        self.kernel = NPUW4A8Int8DynamicMoEMethod()
+
+    # TODO: See if we can merge this method's logic
+    # with CompressedTensorsWNA8MoEMethod. Need more models and tests.
+    # @OrangeRedeng @TamirBaydasov
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.is_per_channel_weight = self.group_size == 0
+        self.num_experts = num_experts
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        # >> weight
+        w13_output_size = intermediate_size_per_partition
+        w2_output_size = hidden_size // 2
+        w13_weight = torch.nn.Parameter(
+            torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                w2_output_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # >> scale
+        weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32
+        w13_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                1,
+                dtype=weight_scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # >> offset
+        w13_weight_offset = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_offset", w13_weight_offset)
+        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+
+        w2_weight_offset = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_offset", w2_weight_offset)
+        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+        # >>> special param for w4a8
+        if self.activation_use_clip:
+            self._init_activation_clip_params(
+                layer,
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                extra_weight_attrs,
+            )
+        else:
+            self._init_extra_scale_params(
+                layer,
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                extra_weight_attrs,
+            )
+
+    def _init_activation_clip_params(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        extra_weight_attrs: dict,
+    ) -> None:
+        """
+        Initializes bias and alpha parameters for quantization schemes that use activation clipping.
+
+        This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to
+        shift and scale the activations or outputs to compensate for the precision loss
+        introduced by clamping activations.
+        """
+        w13_bias = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, dtype=torch.float
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_bias", w13_bias)
+        set_weight_attrs(w13_bias, extra_weight_attrs)
+
+        w2_bias = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, dtype=torch.float),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_bias", w2_bias)
+        set_weight_attrs(w2_bias, extra_weight_attrs)
+
+        w2_alpha = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float), requires_grad=False
+        )
+        layer.register_parameter("w2_alpha", w2_alpha)
+        set_weight_attrs(w2_alpha, extra_weight_attrs)
+
+    def _init_extra_scale_params(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        extra_weight_attrs: dict,
+    ) -> None:
+        """
+        Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping.
+
+        This method registers the following parameters:
+        1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`.
+        2. Secondary Quantization Params (initialized only for grouped quantization):
+            `w13_weight_scale_second`, `w13_weight_offset_second`,
+            `w2_weight_scale_second`, and `w2_weight_offset_second`.
+        """
+        if not self.is_per_channel_weight:
+            w13_weight_scale_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
+            set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
+
+            w13_weight_offset_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter(
+                "w13_weight_offset_second", w13_weight_offset_second
+            )
+            set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
+
+            w2_weight_scale_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second)
+            set_weight_attrs(w2_weight_scale_second, extra_weight_attrs)
+
+            w2_weight_offset_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second)
+            set_weight_attrs(w2_weight_offset_second, extra_weight_attrs)
+
+        w13_scale_bias = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scale_bias", w13_scale_bias)
+        set_weight_attrs(w13_scale_bias, extra_weight_attrs)
+
+        w2_scale_bias = torch.nn.Parameter(
+            torch.empty(
+                num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scale_bias", w2_scale_bias)
+        set_weight_attrs(w2_scale_bias, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        self.kernel.process_weights_after_loading(layer)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        return self.kernel.apply(layer, dispatch_output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return self.kernel.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
+
+
 class NPUCompressedTensorsW4A16Int4DynamicMoEMethod(CompressedTensorsMoEMethod):
 
     def __init__(self, quantization_config) -> None:

From ffdc7dc3c8554b354cd18e079d67101a451513c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 13:41:26 +0300
Subject: [PATCH 124/175] Update compressed_tensors_moe.py

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py   | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index c9465828ce0a..1b4c72937504 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1303,6 +1303,8 @@ def apply(
 class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
 
     def __init__(self, quantization_config) -> None:
+        self.group_size = 0
+        self.tp_size = 1
         self.activation_use_clip = activation_use_clip
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
 

From c38e16fc7b42c7455f916b3549ab67f6e6059bc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 13:44:31 +0300
Subject: [PATCH 125/175] Update compressed_tensors_moe.py

---
 .../compressed_tensors/compressed_tensors_moe.py          | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 1b4c72937504..328f916e16e7 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -125,6 +125,14 @@ def get_moe_method(
                         "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod"
                     )
                     return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config)
+                elif (
+                    quant_config._is_dynamic_token_w4(weight_quant, input_quant)
+                    and input_quant is not None
+                ):
+                    logger.info_once(
+                        "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod"
+                    )
+                    return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config)
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod")
             return CompressedTensorsW4A4Nvfp4MoEMethod(quant_config)

From ef216f4b177409479ef8d325010fca4dfb920f4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 13:47:18 +0300
Subject: [PATCH 126/175] Update compressed_tensors_moe.py

---
 .../compressed_tensors/compressed_tensors_moe.py            | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 328f916e16e7..519a52e6ef96 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1313,7 +1313,11 @@ class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
     def __init__(self, quantization_config) -> None:
         self.group_size = 0
         self.tp_size = 1
-        self.activation_use_clip = activation_use_clip
+        self.activation_use_clip = = (
+            self.quantization_config.get("config_groups", {})
+            .get("group_1", {})
+            .get("activation_use_clip", False)
+        )
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
 
     # TODO: See if we can merge this method's logic

From 5d43c4ac08bdb761794dd0d5098b238cdfd8827a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 14:17:03 +0300
Subject: [PATCH 127/175] Update compressed_tensors_moe.py

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py  | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 519a52e6ef96..0fb5bc0d0d64 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1320,9 +1320,6 @@ def __init__(self, quantization_config) -> None:
         )
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
 
-    # TODO: See if we can merge this method's logic
-    # with CompressedTensorsWNA8MoEMethod. Need more models and tests.
-    # @OrangeRedeng @TamirBaydasov
     def create_weights(
         self,
         layer: torch.nn.Module,

From bee77f0934aae6d7f632f2f17d2bec5ed6835d42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 14:29:26 +0300
Subject: [PATCH 128/175] Update compressed_tensors_moe.py

---
 .../compressed_tensors_moe.py                 | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 0fb5bc0d0d64..69507fc76327 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -125,14 +125,6 @@ def get_moe_method(
                         "Using NPUCompressedTensorsW4A16Int4DynamicMoEMethod"
                     )
                     return NPUCompressedTensorsW4A16Int4DynamicMoEMethod(quant_config)
-                elif (
-                    quant_config._is_dynamic_token_w4(weight_quant, input_quant)
-                    and input_quant is not None
-                ):
-                    logger.info_once(
-                        "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod"
-                    )
-                    return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config)
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             logger.info_once("Using CompressedTensorsW4A4Nvfp4MoEMethod")
             return CompressedTensorsW4A4Nvfp4MoEMethod(quant_config)
@@ -147,6 +139,17 @@ def get_moe_method(
                 raise NotImplementedError(
                     f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
                 )
+        elif:
+            # TODO add w4a8 verification method
+            if _is_npu:
+                logger.info_once(
+                    "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod"
+                )
+                return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config)
+            else:
+                raise NotImplementedError(
+                    f"The W4A8Int8 Fused MoE scheme is implemented only for NPU for now."
+                )
         else:
             raise RuntimeError(
                 f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"

From ee59b957ec5961c81429eb225977d5bd51ffdfe5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 14:34:50 +0300
Subject: [PATCH 129/175] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 06158d8eb580..eb41560daebe 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -311,7 +311,7 @@ def _pack_to_int32(self, weight: torch.Tensor):
         ), "the last dim of weight needs to be divided by 4"
         return weight.view(torch.int32).contiguous()
 
-        def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if not self.activation_use_clip:
             self._process_weights_without_clip(layer)
         else:

From 02d7a6a1688ef266c44415470fc8489538c8ea7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 14:37:18 +0300
Subject: [PATCH 130/175] Update msmodelslim_moe.py

---
 .../msmodelslim/msmodelslim_moe.py            | 88 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 6c71523ff16a..bf59a22c25a1 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -81,9 +81,14 @@ def __init__(
         self.quant_config = quant_config
         self.group_size = 0
         self.tp_size = 1
+        self.activation_use_clip = (
+            self.quant_description.get("config_groups", {})
+            .get("group_1", {})
+            .get("activation_use_clip", False)
+        )
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
 
-    def create_weights(
+def create_weights(
         self,
         layer: torch.nn.Module,
         num_experts: int,
@@ -122,9 +127,13 @@ def create_weights(
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
         # >> scale
+        weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32
         w13_weight_scale = torch.nn.Parameter(
             torch.empty(
-                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+                num_experts,
+                2 * intermediate_size_per_partition,
+                1,
+                dtype=weight_scale_dtype,
             ),
             requires_grad=False,
         )
@@ -132,7 +141,7 @@ def create_weights(
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
 
         w2_weight_scale = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype),
             requires_grad=False,
         )
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
@@ -156,6 +165,77 @@ def create_weights(
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
         # >>> special param for w4a8
+        if self.activation_use_clip:
+            self._init_activation_clip_params(
+                layer,
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                extra_weight_attrs,
+            )
+        else:
+            self._init_extra_scale_params(
+                layer,
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                extra_weight_attrs,
+            )
+
+    def _init_activation_clip_params(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        extra_weight_attrs: dict,
+    ) -> None:
+        """
+        Initializes bias and alpha parameters for quantization schemes that use activation clipping.
+
+        This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to
+        shift and scale the activations or outputs to compensate for the precision loss
+        introduced by clamping activations.
+        """
+        w13_bias = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, dtype=torch.float
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_bias", w13_bias)
+        set_weight_attrs(w13_bias, extra_weight_attrs)
+
+        w2_bias = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, dtype=torch.float),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_bias", w2_bias)
+        set_weight_attrs(w2_bias, extra_weight_attrs)
+
+        w2_alpha = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float), requires_grad=False
+        )
+        layer.register_parameter("w2_alpha", w2_alpha)
+        set_weight_attrs(w2_alpha, extra_weight_attrs)
+
+    def _init_extra_scale_params(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        extra_weight_attrs: dict,
+    ) -> None:
+        """
+        Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping.
+
+        This method registers the following parameters:
+        1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`.
+        2. Secondary Quantization Params (initialized only for grouped quantization):
+            `w13_weight_scale_second`, `w13_weight_offset_second`,
+            `w2_weight_scale_second`, and `w2_weight_offset_second`.
+        """
         if not self.is_per_channel_weight:
             w13_weight_scale_second = torch.nn.Parameter(
                 torch.empty(
@@ -168,6 +248,7 @@ def create_weights(
             )
             layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
             set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
+
             w13_weight_offset_second = torch.nn.Parameter(
                 torch.empty(
                     num_experts,
@@ -224,6 +305,7 @@ def create_weights(
         layer.register_parameter("w2_scale_bias", w2_scale_bias)
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
+
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight)
 

From ff41d738a6ef0bb35fc2515f1f3d5ebc82759704 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 14:47:56 +0300
Subject: [PATCH 131/175] Update compressed_tensors_moe.py

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 69507fc76327..5b68018ebe17 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1313,6 +1313,7 @@ def apply(
 
 class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
 
+    ### TODO: Get rid of code duplication with python/sglang/srt/msmodelslim/msmodelslim_moe.py @OrangeRedeng @TamirBaydasov
     def __init__(self, quantization_config) -> None:
         self.group_size = 0
         self.tp_size = 1
@@ -1322,7 +1323,7 @@ def __init__(self, quantization_config) -> None:
             .get("activation_use_clip", False)
         )
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
-
+    
     def create_weights(
         self,
         layer: torch.nn.Module,

From 8d1bb48b67200273e4a5d503e1d0cb66aed4bff3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 14:53:00 +0300
Subject: [PATCH 132/175] Fix lint issue

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 5b68018ebe17..df056208cd9b 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1323,7 +1323,7 @@ def __init__(self, quantization_config) -> None:
             .get("activation_use_clip", False)
         )
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
-    
+
     def create_weights(
         self,
         layer: torch.nn.Module,

From 6b46093e7e3365ac684bcbb5c80b455e1c1b8825 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:21:27 +0300
Subject: [PATCH 133/175] Fix lint issue

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index bf59a22c25a1..8a09950f0e6b 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -88,7 +88,7 @@ def __init__(
         )
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
 
-def create_weights(
+    def create_weights(
         self,
         layer: torch.nn.Module,
         num_experts: int,

From 567a771bdcd29c32567517c702d047aa9ae3caed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:28:24 +0300
Subject: [PATCH 134/175] Update compressed_tensors_moe.py

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index df056208cd9b..0ca79ef4db1f 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -139,7 +139,7 @@ def get_moe_method(
                 raise NotImplementedError(
                     f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
                 )
-        elif:
+        elif quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is not None:
             # TODO add w4a8 verification method
             if _is_npu:
                 logger.info_once(

From 1b2f289382231c985ac18437d9921fab3664eaa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:33:21 +0300
Subject: [PATCH 135/175] Update msmodelslim_moe.py

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py       | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 8a09950f0e6b..2171295eb317 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -305,7 +305,6 @@ def _init_extra_scale_params(
         layer.register_parameter("w2_scale_bias", w2_scale_bias)
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
-
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight)
 

From fe7067c0241986f1df1498b36fba611d62c0437f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:33:58 +0300
Subject: [PATCH 136/175] Update compressed_tensors_moe.py

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 0ca79ef4db1f..6b2421347993 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1317,7 +1317,7 @@ class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
     def __init__(self, quantization_config) -> None:
         self.group_size = 0
         self.tp_size = 1
-        self.activation_use_clip = = (
+        self.activation_use_clip = (
             self.quantization_config.get("config_groups", {})
             .get("group_1", {})
             .get("activation_use_clip", False)

From 2e390e3da605aa3aec9e9644e7dcdbb4edcea8e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:42:04 +0300
Subject: [PATCH 137/175] Fix lint issue

---
 .../compressed_tensors/compressed_tensors_moe.py         | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 6b2421347993..ee78d6706023 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -139,12 +139,13 @@ def get_moe_method(
                 raise NotImplementedError(
                     f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
                 )
-        elif quant_config._is_dynamic_token_w4(weight_quant, input_quant) and input_quant is not None:
+        elif (
+            quant_config._is_dynamic_token_w4(weight_quant, input_quant) 
+            and input_quant is not None\
+        ):
             # TODO add w4a8 verification method
             if _is_npu:
-                logger.info_once(
-                    "Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod"
-                )
+                logger.info_once("Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod")
                 return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config)
             else:
                 raise NotImplementedError(

From ee17e0cc8cb1d6c4199688a642521e760a0f7060 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:52:53 +0300
Subject: [PATCH 138/175] Update msmodelslim_moe.py

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 2171295eb317..ed379debbb67 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -82,7 +82,7 @@ def __init__(
         self.group_size = 0
         self.tp_size = 1
         self.activation_use_clip = (
-            self.quant_description.get("config_groups", {})
+            self.quant_config.get("config_groups", {})
             .get("group_1", {})
             .get("activation_use_clip", False)
         )

From 662fadaaf70f605487bf58bffb78fbc1ef810704 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:57:13 +0300
Subject: [PATCH 139/175] Update msmodelslim_moe.py

---
 .../msmodelslim/msmodelslim_moe.py            | 63 +------------------
 1 file changed, 2 insertions(+), 61 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index ed379debbb67..2f1cfd57fb03 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -81,11 +81,7 @@ def __init__(
         self.quant_config = quant_config
         self.group_size = 0
         self.tp_size = 1
-        self.activation_use_clip = (
-            self.quant_config.get("config_groups", {})
-            .get("group_1", {})
-            .get("activation_use_clip", False)
-        )
+        self.activation_use_clip = False
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
 
     def create_weights(
@@ -181,61 +177,6 @@ def create_weights(
                 intermediate_size_per_partition,
                 extra_weight_attrs,
             )
-
-    def _init_activation_clip_params(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        extra_weight_attrs: dict,
-    ) -> None:
-        """
-        Initializes bias and alpha parameters for quantization schemes that use activation clipping.
-
-        This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to
-        shift and scale the activations or outputs to compensate for the precision loss
-        introduced by clamping activations.
-        """
-        w13_bias = torch.nn.Parameter(
-            torch.ones(
-                num_experts, 2 * intermediate_size_per_partition, dtype=torch.float
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_bias", w13_bias)
-        set_weight_attrs(w13_bias, extra_weight_attrs)
-
-        w2_bias = torch.nn.Parameter(
-            torch.ones(num_experts, hidden_size, dtype=torch.float),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_bias", w2_bias)
-        set_weight_attrs(w2_bias, extra_weight_attrs)
-
-        w2_alpha = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float), requires_grad=False
-        )
-        layer.register_parameter("w2_alpha", w2_alpha)
-        set_weight_attrs(w2_alpha, extra_weight_attrs)
-
-    def _init_extra_scale_params(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        extra_weight_attrs: dict,
-    ) -> None:
-        """
-        Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping.
-
-        This method registers the following parameters:
-        1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`.
-        2. Secondary Quantization Params (initialized only for grouped quantization):
-            `w13_weight_scale_second`, `w13_weight_offset_second`,
-            `w2_weight_scale_second`, and `w2_weight_offset_second`.
-        """
         if not self.is_per_channel_weight:
             w13_weight_scale_second = torch.nn.Parameter(
                 torch.empty(
@@ -306,7 +247,7 @@ def _init_extra_scale_params(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight)
+        self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"

From 2fb272d65bd9817a6ce036e2d350344317f46a10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:59:28 +0300
Subject: [PATCH 140/175] Update compressed_tensors_moe.py

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index ee78d6706023..fa02ded08c0a 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1317,6 +1317,7 @@ class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
     ### TODO: Get rid of code duplication with python/sglang/srt/msmodelslim/msmodelslim_moe.py @OrangeRedeng @TamirBaydasov
     def __init__(self, quantization_config) -> None:
         self.group_size = 0
+        self.is_per_channel_weight = self.group_size == 0
         self.tp_size = 1
         self.activation_use_clip = (
             self.quantization_config.get("config_groups", {})
@@ -1336,7 +1337,6 @@ def create_weights(
     ) -> None:
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
-        self.is_per_channel_weight = self.group_size == 0
         self.num_experts = num_experts
         extra_weight_attrs.update(
             {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
@@ -1543,7 +1543,7 @@ def _init_extra_scale_params(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        self.kernel.process_weights_after_loading(layer)
+        self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig

From ae7875c9bf93774631d6c418e16dedc208d1a20c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:59:59 +0300
Subject: [PATCH 141/175] Update msmodelslim_moe.py

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 2f1cfd57fb03..fa0d85f328d6 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -80,6 +80,7 @@ def __init__(
     ):
         self.quant_config = quant_config
         self.group_size = 0
+        self.is_per_channel_weight = self.group_size == 0
         self.tp_size = 1
         self.activation_use_clip = False
         self.kernel = NPUW4A8Int8DynamicMoEMethod()
@@ -95,7 +96,6 @@ def create_weights(
     ) -> None:
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
-        self.is_per_channel_weight = self.group_size == 0
         self.num_experts = num_experts
         extra_weight_attrs.update(
             {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}

From b4c0ebe0c6560a7a9b7edbd5ffad81728d213306 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:03:16 +0300
Subject: [PATCH 142/175] Update fused_moe_method_npu.py

---
 .../npu/quantization/fused_moe_method_npu.py         | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index eb41560daebe..b4aa715aed03 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -311,9 +311,9 @@ def _pack_to_int32(self, weight: torch.Tensor):
         ), "the last dim of weight needs to be divided by 4"
         return weight.view(torch.int32).contiguous()
 
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if not self.activation_use_clip:
-            self._process_weights_without_clip(layer)
+    def process_weights_after_loading(self, layer: torch.nn.Module, is_per_channel_weight, activation_use_clip) -> None:
+        if not activation_use_clip:
+            self._process_weights_without_clip(layer, is_per_channel_weight)
         else:
             self._process_weights_with_clip(layer)
 
@@ -330,7 +330,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
         layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
 
-    def _process_weights_without_clip(self, layer: torch.nn.Module) -> None:
+    def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_weight) -> None:
         w13_weight_scale_second = (
             layer.w13_weight_scale_second.data
             if hasattr(layer, "w13_weight_scale_second")
@@ -342,10 +342,10 @@ def _process_weights_without_clip(self, layer: torch.nn.Module) -> None:
             else None
         )
         layer.w13_weight_scale.data, w13_bias = self.process_scale(
-            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second
+            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight,
         )
         layer.w2_weight_scale.data, w2_bias = self.process_scale(
-            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second
+            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight,
         )
         if hasattr(layer, "w13_weight_scale_second"):
             # scale_second is no longer used, release this part of the memory

From 897094ccb8c6e519513dd074ae0d4609552c8b73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:07:21 +0300
Subject: [PATCH 143/175] Update msmodelslim_moe.py

---
 .../msmodelslim/msmodelslim_moe.py            | 26 +++----------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index fa0d85f328d6..856bbd32574f 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -96,6 +96,7 @@ def create_weights(
     ) -> None:
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
+        self.is_per_channel_weight = self.group_size == 0
         self.num_experts = num_experts
         extra_weight_attrs.update(
             {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
@@ -123,13 +124,9 @@ def create_weights(
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
         # >> scale
-        weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32
         w13_weight_scale = torch.nn.Parameter(
             torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                1,
-                dtype=weight_scale_dtype,
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
             ),
             requires_grad=False,
         )
@@ -137,7 +134,7 @@ def create_weights(
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
 
         w2_weight_scale = torch.nn.Parameter(
-            torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype),
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
             requires_grad=False,
         )
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
@@ -161,22 +158,6 @@ def create_weights(
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
         # >>> special param for w4a8
-        if self.activation_use_clip:
-            self._init_activation_clip_params(
-                layer,
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                extra_weight_attrs,
-            )
-        else:
-            self._init_extra_scale_params(
-                layer,
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                extra_weight_attrs,
-            )
         if not self.is_per_channel_weight:
             w13_weight_scale_second = torch.nn.Parameter(
                 torch.empty(
@@ -189,7 +170,6 @@ def create_weights(
             )
             layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
             set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
-
             w13_weight_offset_second = torch.nn.Parameter(
                 torch.empty(
                     num_experts,

From b4636259e2dd7ad31e22b306be4cbcd1ecc7c28b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:12:36 +0300
Subject: [PATCH 144/175] Fix lint issue

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 856bbd32574f..6b74c7fe2eba 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -227,7 +227,11 @@ def create_weights(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip)
+        self.kernel.process_weights_after_loading(
+            layer, 
+            self.is_per_channel_weight, 
+            self.activation_use_clip
+        )
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"

From 349dcd09986437f03d83dc701f839d4a763a7c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:14:21 +0300
Subject: [PATCH 145/175] Fix lint issue

---
 .../compressed_tensors/compressed_tensors_moe.py       | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index fa02ded08c0a..9a302dbe92a0 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -140,8 +140,8 @@ def get_moe_method(
                     f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
                 )
         elif (
-            quant_config._is_dynamic_token_w4(weight_quant, input_quant) 
-            and input_quant is not None\
+            quant_config._is_dynamic_token_w4(weight_quant, input_quant)
+            and input_quant is not None
         ):
             # TODO add w4a8 verification method
             if _is_npu:
@@ -1543,7 +1543,11 @@ def _init_extra_scale_params(
         set_weight_attrs(w2_scale_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        self.kernel.process_weights_after_loading(layer, self.is_per_channel_weight, self.activation_use_clip)
+        self.kernel.process_weights_after_loading(
+            layer, 
+            self.is_per_channel_weight, 
+            self.activation_use_clip
+        )
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig

From b77689546a48d3086d74c19fe5020c2ab023c17f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:15:40 +0300
Subject: [PATCH 146/175] Update fused_moe_method_npu.py

---
 .../npu/quantization/fused_moe_method_npu.py       | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index b4aa715aed03..00acf80b793c 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -311,7 +311,9 @@ def _pack_to_int32(self, weight: torch.Tensor):
         ), "the last dim of weight needs to be divided by 4"
         return weight.view(torch.int32).contiguous()
 
-    def process_weights_after_loading(self, layer: torch.nn.Module, is_per_channel_weight, activation_use_clip) -> None:
+    def process_weights_after_loading(
+        self, layer: torch.nn.Module, is_per_channel_weight, activation_use_clip
+    ) -> None:
         if not activation_use_clip:
             self._process_weights_without_clip(layer, is_per_channel_weight)
         else:
@@ -342,10 +344,16 @@ def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_w
             else None
         )
         layer.w13_weight_scale.data, w13_bias = self.process_scale(
-            layer.w13_weight, layer.w13_weight_scale.data, w13_weight_scale_second, is_per_channel_weight,
+            layer.w13_weight, 
+            layer.w13_weight_scale.data, 
+            w13_weight_scale_second, 
+            is_per_channel_weight,
         )
         layer.w2_weight_scale.data, w2_bias = self.process_scale(
-            layer.w2_weight, layer.w2_weight_scale.data, w2_weight_scale_second, is_per_channel_weight,
+            layer.w2_weight, 
+            layer.w2_weight_scale.data, 
+            w2_weight_scale_second, 
+            is_per_channel_weight,
         )
         if hasattr(layer, "w13_weight_scale_second"):
             # scale_second is no longer used, release this part of the memory

From 56c8d06fdf0b04e58f9883faffb96f1bc33c6944 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:20:40 +0300
Subject: [PATCH 147/175] Fix lint issue

---
 .../npu/quantization/fused_moe_method_npu.py   | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 00acf80b793c..5ffd9d8e3a6b 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -332,7 +332,11 @@ def process_weights_after_loading(
         layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
         layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
 
-    def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_weight) -> None:
+    def _process_weights_without_clip(
+        self, 
+        layer: torch.nn.Module, 
+        is_per_channel_weight
+    ) -> None:
         w13_weight_scale_second = (
             layer.w13_weight_scale_second.data
             if hasattr(layer, "w13_weight_scale_second")
@@ -344,15 +348,15 @@ def _process_weights_without_clip(self, layer: torch.nn.Module, is_per_channel_w
             else None
         )
         layer.w13_weight_scale.data, w13_bias = self.process_scale(
-            layer.w13_weight, 
-            layer.w13_weight_scale.data, 
-            w13_weight_scale_second, 
+            layer.w13_weight,
+            layer.w13_weight_scale.data,
+            w13_weight_scale_second,
             is_per_channel_weight,
         )
         layer.w2_weight_scale.data, w2_bias = self.process_scale(
-            layer.w2_weight, 
-            layer.w2_weight_scale.data, 
-            w2_weight_scale_second, 
+            layer.w2_weight,
+            layer.w2_weight_scale.data,
+            w2_weight_scale_second,
             is_per_channel_weight,
         )
         if hasattr(layer, "w13_weight_scale_second"):

From 4e9c0d07b983a9385812d35f2ed73fce6fe20360 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:21:54 +0300
Subject: [PATCH 148/175] Fix lint issue

---
 .../quantization/compressed_tensors/compressed_tensors_moe.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 9a302dbe92a0..e10f9d5e50a1 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1544,9 +1544,7 @@ def _init_extra_scale_params(
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(
-            layer, 
-            self.is_per_channel_weight, 
-            self.activation_use_clip
+            layer, self.is_per_channel_weight, self.activation_use_clip
         )
 
     def create_moe_runner(

From f091ab09b8c8bdbf2aa41449697c5b720ad3323d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:22:42 +0300
Subject: [PATCH 149/175] Fix lint issue

---
 .../srt/layers/quantization/msmodelslim/msmodelslim_moe.py    | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
index 6b74c7fe2eba..c9dc5621cadf 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
@@ -228,9 +228,7 @@ def create_weights(
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(
-            layer, 
-            self.is_per_channel_weight, 
-            self.activation_use_clip
+            layer, self.is_per_channel_weight, self.activation_use_clip
         )
 
     def create_moe_runner(

From 30ea24ef8c39dc85f9c96d33b2d229fe9becaf18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:30:00 +0300
Subject: [PATCH 150/175] Fix lint issue

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 5ffd9d8e3a6b..3e26ff354fde 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -333,9 +333,7 @@ def process_weights_after_loading(
         layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
 
     def _process_weights_without_clip(
-        self, 
-        layer: torch.nn.Module, 
-        is_per_channel_weight
+        self, layer: torch.nn.Module, is_per_channel_weight
     ) -> None:
         w13_weight_scale_second = (
             layer.w13_weight_scale_second.data

From b430667326d29f3f0277a892b80d52a9fd98533d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 17:30:47 +0300
Subject: [PATCH 151/175] Update fused_moe_method_npu.py

---
 .../npu/quantization/fused_moe_method_npu.py           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 3e26ff354fde..91a5da075807 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -329,8 +329,8 @@ def process_weights_after_loading(
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
 
-        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
-        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
+        layer.w13_weight.data = self._pack_to_int32(layer.w13_weight.data)
+        layer.w2_weight.data = self._pack_to_int32(layer.w2_weight.data)
 
     def _process_weights_without_clip(
         self, layer: torch.nn.Module, is_per_channel_weight
@@ -345,13 +345,13 @@ def _process_weights_without_clip(
             if hasattr(layer, "w2_weight_scale_second")
             else None
         )
-        layer.w13_weight_scale.data, w13_bias = self.process_scale(
+        layer.w13_weight_scale.data, w13_bias = self._process_scale(
             layer.w13_weight,
             layer.w13_weight_scale.data,
             w13_weight_scale_second,
             is_per_channel_weight,
         )
-        layer.w2_weight_scale.data, w2_bias = self.process_scale(
+        layer.w2_weight_scale.data, w2_bias = self._process_scale(
             layer.w2_weight,
             layer.w2_weight_scale.data,
             w2_weight_scale_second,
@@ -364,7 +364,7 @@ def _process_weights_without_clip(
             del layer.w13_weight_offset_second
             del layer.w2_weight_offset_second
 
-        self.update_bias(layer, w13_bias, w2_bias)
+        self._update_bias(layer, w13_bias, w2_bias)
 
     def _process_weights_with_clip(self, layer: torch.nn.Module) -> None:
         w13_weight_scale = (

From 47e8406982dfb879c46647271e128a3cf7678ad1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 18:09:45 +0300
Subject: [PATCH 152/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 5164d72eb513..ba4049c6d8a9 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -75,8 +75,8 @@ def test_gsm8k(self):
         metrics = run_eval(args)
         print(metrics)
 
-        self.assertAlmostEqual(metrics["accuracy"], 0.84)
-        self.assertAlmostEqual(metrics["output_throughput"], 1100)
+        self.assertGreaterEqual(metrics["accuracy"], 0.80)
+        self.assertGreaterEqual(metrics["output_throughput"], 1000)
 
     def run_decode(self, max_new_tokens):
         response = requests.post(

From 97b38e4eb33c8a645ddc6ab3b2560db02a3587b8 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Mon, 29 Dec 2025 22:02:50 +0300
Subject: [PATCH 153/175] Rename MsModelSlim -> ModelSlim

---
 docs/platforms/ascend_npu_quantization.md         | 15 ++++++++-------
 python/sglang/srt/layers/quantization/__init__.py |  2 +-
 .../compressed_tensors/compressed_tensors_moe.py  |  2 +-
 .../srt/layers/quantization/modelslim/README.md   | 14 ++++++++++++++
 .../msmodelslim.py => modelslim/modelslim.py}     |  6 ++----
 .../modelslim_moe.py}                             |  2 +-
 .../quantization/modelslim/schemes/__init__.py    | 11 +++++++++++
 .../schemes/modelslim_scheme.py}                  |  0
 .../schemes/modelslim_w4a4_int4.py}               |  2 +-
 .../schemes/modelslim_w8a8_int8.py}               |  2 +-
 .../srt/layers/quantization/msmodelslim/README.md | 14 --------------
 .../quantization/msmodelslim/schemes/__init__.py  | 11 -----------
 12 files changed, 40 insertions(+), 41 deletions(-)
 create mode 100644 python/sglang/srt/layers/quantization/modelslim/README.md
 rename python/sglang/srt/layers/quantization/{msmodelslim/msmodelslim.py => modelslim/modelslim.py} (98%)
 rename python/sglang/srt/layers/quantization/{msmodelslim/msmodelslim_moe.py => modelslim/modelslim_moe.py} (99%)
 create mode 100644 python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py
 rename python/sglang/srt/layers/quantization/{msmodelslim/schemes/msmodelslim_scheme.py => modelslim/schemes/modelslim_scheme.py} (100%)
 rename python/sglang/srt/layers/quantization/{msmodelslim/schemes/msmodelslim_w4a4_int4.py => modelslim/schemes/modelslim_w4a4_int4.py} (97%)
 rename python/sglang/srt/layers/quantization/{msmodelslim/schemes/msmodelslim_w8a8_int8.py => modelslim/schemes/modelslim_w8a8_int8.py} (98%)
 delete mode 100644 python/sglang/srt/layers/quantization/msmodelslim/README.md
 delete mode 100644 python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py

diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md
index 172b5e295fb6..4c40fde6e170 100644
--- a/docs/platforms/ascend_npu_quantization.md
+++ b/docs/platforms/ascend_npu_quantization.md
@@ -2,19 +2,20 @@ Quantization on Ascend.
 
 To load already quantized models, simply load the model weights and config. Again, if the model has been quantized offline, there's no need to add `--quantization` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` or `config.json` config.
 
-MsModelSlim on Ascend support:
+[ModelSlim on Ascend support](https://github.com/sgl-project/sglang/pull/14504):
 - [x] W4A4 dynamic linear
 - [x] W8A8 static linear
 - [x] W8A8 dynamic linear
 - [x] W4A8 dynamic MOE
 - [x] W8A8 dynamic MOE
 
-AWQ on Ascend support:
+[AWQ on Ascend support](https://github.com/sgl-project/sglang/pull/10158):
 - [x] W4A16 linear
-- [x] W8A16 linear # Test required
-- [x] W4A16 MOE # Test required
+- [x] W8A16 linear # Need to test
+- [x] W4A16 MOE # Need to test
 
 Compressed-tensors (LLM Compressor) on Ascend support:
-- [x] W8A8 dynamic linear
-- [x] W8A8 dynamic MOE
-- [x] W4A16 MOE
+- [x] [W4A8 dynamic MOE with/without activation clip](https://github.com/sgl-project/sglang/pull/14736) # Need to test
+- [x] [W4A16 MOE](https://github.com/sgl-project/sglang/pull/12759)
+- [x] [W8A8 dynamic linear](https://github.com/sgl-project/sglang/pull/14504)
+- [x] [W8A8 dynamic MOE](https://github.com/sgl-project/sglang/pull/14504)
diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py
index 4aa1843a4d85..161301d6124e 100644
--- a/python/sglang/srt/layers/quantization/__init__.py
+++ b/python/sglang/srt/layers/quantization/__init__.py
@@ -79,7 +79,7 @@ def override_quantization_method(self, *args, **kwargs):
     )
 
 if is_npu():
-    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig
+    from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig
 
     BASE_QUANTIZATION_METHODS.update(
         {
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index e10f9d5e50a1..15ddb1d25a8b 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1314,7 +1314,7 @@ def apply(
 
 class NPUCompressedTensorsW4A8Int8DynamicMoEMethod(CompressedTensorsMoEMethod):
 
-    ### TODO: Get rid of code duplication with python/sglang/srt/msmodelslim/msmodelslim_moe.py @OrangeRedeng @TamirBaydasov
+    ### TODO: Get rid of code duplication with python/sglang/srt/modelslim/modelslim_moe.py @OrangeRedeng @TamirBaydasov
     def __init__(self, quantization_config) -> None:
         self.group_size = 0
         self.is_per_channel_weight = self.group_size == 0
diff --git a/python/sglang/srt/layers/quantization/modelslim/README.md b/python/sglang/srt/layers/quantization/modelslim/README.md
new file mode 100644
index 000000000000..3d34b67ae712
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/modelslim/README.md
@@ -0,0 +1,14 @@
+Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/modelslim) module.
+
+`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with ModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
+
+ModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
+- [x] W4A4 dynamic linear
+- [x] W8A8 static linear
+- [x] W8A8 dynamic linear
+- [x] W4A8 dynamic MOE
+- [x] W8A8 dynamic MOE
+
+Also ModelSlim module include:
+- [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag)
+- [x] Unit-tests for w4a4 modelslim, w8a8 modelslim
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py b/python/sglang/srt/layers/quantization/modelslim/modelslim.py
similarity index 98%
rename from python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
rename to python/sglang/srt/layers/quantization/modelslim/modelslim.py
index 61913209da4a..20b6c88a1d9d 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim.py
+++ b/python/sglang/srt/layers/quantization/modelslim/modelslim.py
@@ -14,10 +14,8 @@
     QuantizeMethodBase,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
-from sglang.srt.layers.quantization.msmodelslim.msmodelslim_moe import (
-    ModelSlimMoEMethod,
-)
-from sglang.srt.layers.quantization.msmodelslim.schemes import (
+from sglang.srt.layers.quantization.modelslim.modelslim_moe import ModelSlimMoEMethod
+from sglang.srt.layers.quantization.modelslim.schemes import (
     ModelSlimScheme,
     ModelSlimW4A4Int4,
     ModelSlimW8A8Int8,
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py
similarity index 99%
rename from python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
rename to python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py
index c9dc5621cadf..94d1d3a660c2 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/msmodelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py
@@ -20,7 +20,7 @@
         CombineInput,
         StandardDispatchOutput,
     )
-    from sglang.srt.layers.quantization.msmodelslim.msmodelslim import ModelSlimConfig
+    from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig
 
 logger = logging.getLogger(__name__)
 
diff --git a/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py
new file mode 100644
index 000000000000..551b862a4424
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from .modelslim_scheme import ModelSlimScheme
+from .modelslim_w4a4_int4 import ModelSlimW4A4Int4
+from .modelslim_w8a8_int8 import ModelSlimW8A8Int8
+
+__all__ = [
+    "ModelSlimScheme",
+    "ModelSlimW8A8Int8",
+    "ModelSlimW4A4Int4",
+]
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_scheme.py
similarity index 100%
rename from python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_scheme.py
rename to python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_scheme.py
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py
similarity index 97%
rename from python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
rename to python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py
index 6fb7561cc438..8e7f08277f99 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w4a4_int4.py
+++ b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py
@@ -9,7 +9,7 @@
     NPU_W4A4DynamicLinearMethod,
 )
 from sglang.srt.layers.parameter import PerTensorScaleParameter
-from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
+from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimScheme
 from sglang.srt.utils import set_weight_attrs
 
 
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py
similarity index 98%
rename from python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
rename to python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py
index 9986e1976eaf..16c62d551fa3 100644
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/msmodelslim_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py
@@ -14,7 +14,7 @@
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from sglang.srt.layers.quantization.msmodelslim.schemes import ModelSlimScheme
+from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimScheme
 
 
 class ModelSlimW8A8Int8(ModelSlimScheme):
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/README.md b/python/sglang/srt/layers/quantization/msmodelslim/README.md
deleted file mode 100644
index 65f5eb029323..000000000000
--- a/python/sglang/srt/layers/quantization/msmodelslim/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-Quantization [msModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module.
-
-`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with MSModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
-
-MsModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
-- [x] W4A4 dynamic linear
-- [x] W8A8 static linear
-- [x] W8A8 dynamic linear
-- [x] W4A8 dynamic MOE
-- [x] W8A8 dynamic MOE
-
-Also MsModelSlim module include:
-- [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag)
-- [x] Unit-tests for w4a4 modelslim, w8a8 modelslim
diff --git a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py b/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
deleted file mode 100644
index fba516eed7c0..000000000000
--- a/python/sglang/srt/layers/quantization/msmodelslim/schemes/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from .msmodelslim_scheme import ModelSlimScheme
-from .msmodelslim_w4a4_int4 import ModelSlimW4A4Int4
-from .msmodelslim_w8a8_int8 import ModelSlimW8A8Int8
-
-__all__ = [
-    "ModelSlimScheme",
-    "ModelSlimW8A8Int8",
-    "ModelSlimW4A4Int4",
-]

From d6f0064f82eecb984db59b40ec0dedf88e0b3a5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 30 Dec 2025 08:40:40 +0300
Subject: [PATCH 154/175] Fix w4a4 test

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index ba4049c6d8a9..33d1f62a81b4 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -78,20 +78,6 @@ def test_gsm8k(self):
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
         self.assertGreaterEqual(metrics["output_throughput"], 1000)
 
-    def run_decode(self, max_new_tokens):
-        response = requests.post(
-            self.base_url + "/generate",
-            json={
-                "text": "The capital of France is",
-                "sampling_params": {
-                    "temperature": 0,
-                    "max_new_tokens": max_new_tokens,
-                },
-                "ignore_eos": True,
-            },
-        )
-        return response.json()
-
     def test_throughput(self):
         max_tokens = 256
 
@@ -103,7 +89,7 @@ def test_throughput(self):
         print(f"Throughput: {throughput} tokens/s")
 
         if is_in_ci():
-            self.assertAlmostEqual(throughput, 38)
+            self.assertAlmostEqual(throughput, 35)
 
 
 if __name__ == "__main__":

From 0aad1d189ad69d1f0732e4dc01f67dba888bc0ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 30 Dec 2025 08:45:36 +0300
Subject: [PATCH 155/175] Fix link issue

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 33d1f62a81b4..bcaaf5d3fc84 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -9,8 +9,6 @@
 from types import SimpleNamespace
 from urllib.parse import urlparse
 
-import requests
-
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (

From e861924a2762d6d800b48011ba502231c9b13078 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 30 Dec 2025 08:50:44 +0300
Subject: [PATCH 156/175] Return run_decode to test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index bcaaf5d3fc84..2bc0df0c1687 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -9,6 +9,8 @@
 from types import SimpleNamespace
 from urllib.parse import urlparse
 
+import requests
+
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (
@@ -76,6 +78,20 @@ def test_gsm8k(self):
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
         self.assertGreaterEqual(metrics["output_throughput"], 1000)
 
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
     def test_throughput(self):
         max_tokens = 256
 

From a443cf976d51b6ebd992f08275824651c872b02d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 30 Dec 2025 09:34:37 +0300
Subject: [PATCH 157/175] Update modelslim_moe.py

---
 .../srt/layers/quantization/modelslim/modelslim_moe.py     | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py
index 94d1d3a660c2..095d09f31155 100644
--- a/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py
+++ b/python/sglang/srt/layers/quantization/modelslim/modelslim_moe.py
@@ -241,11 +241,8 @@ def apply(
         layer,
         dispatch_output: "StandardDispatchOutput",
     ) -> "CombineInput":
-        # FIXME W4A8 without EP gives 0 accuracy
-        raise NotImplementedError(
-            f"W4A8 only support with deepep for now, please enable --moe-a2a-backend deepep"
-        )
-        # return self.kernel.apply(layer, dispatch_output)
+        # FIXME W4A8 without EP can give 0 accuracy
+        return self.kernel.apply(layer, dispatch_output)
 
     def apply_without_routing_weights(
         self,

From 373b9c5ac967a1cff11eace94f99624dc0cfdc9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 30 Dec 2025 09:37:01 +0300
Subject: [PATCH 158/175] Fix link

---
 python/sglang/srt/layers/quantization/modelslim/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/modelslim/README.md b/python/sglang/srt/layers/quantization/modelslim/README.md
index 3d34b67ae712..dd3f35ff3fbf 100644
--- a/python/sglang/srt/layers/quantization/modelslim/README.md
+++ b/python/sglang/srt/layers/quantization/modelslim/README.md
@@ -1,4 +1,4 @@
-Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/modelslim) module.
+Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module.
 
 `--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with ModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
 

From 86093bbf11c0a0d6f6256d615af6c56988ba6280 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 30 Dec 2025 09:38:41 +0300
Subject: [PATCH 159/175] Fix link again

---
 python/sglang/srt/layers/quantization/modelslim/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/modelslim/README.md b/python/sglang/srt/layers/quantization/modelslim/README.md
index dd3f35ff3fbf..d2a43d696741 100644
--- a/python/sglang/srt/layers/quantization/modelslim/README.md
+++ b/python/sglang/srt/layers/quantization/modelslim/README.md
@@ -1,4 +1,4 @@
-Quantization [ModelSlim](https://gitcode.com/Ascend/msit/tree/master/msmodelslim) module.
+Quantization [ModelSlim](https://gitcode.com/Ascend/msit) module.
 
 `--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with ModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
 

From 70f2fabbe544cb2d1d10a9086230a13735f55e4d Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Tue, 30 Dec 2025 11:18:22 +0300
Subject: [PATCH 160/175] Add w4a8 strategy to compressed-tensors

---
 .../compressed_tensors/compressed_tensors.py  | 22 +++++++++++++++++++
 .../compressed_tensors_moe.py                 |  6 +----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
index 0ed642950fbc..e3f9725e5440 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -310,6 +310,28 @@ def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bo
         else:
             return False
 
+    def _is_dynamic_token_w4a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_weight_4_bits = weight_quant.num_bits == 4
+        is_activation_8_bits = input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.GROUP.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+
+        return (
+            is_weight_4_bits
+            and is_activation_8_bits
+            and is_token
+            and weight_quant.symmetric
+            and is_dynamic
+        )
+
     def _is_static_tensor_w8a8(
         self, weight_quant: BaseModel, input_quant: BaseModel
     ) -> bool:
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 15ddb1d25a8b..7ef04ef637ad 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -139,11 +139,7 @@ def get_moe_method(
                 raise NotImplementedError(
                     f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
                 )
-        elif (
-            quant_config._is_dynamic_token_w4(weight_quant, input_quant)
-            and input_quant is not None
-        ):
-            # TODO add w4a8 verification method
+        elif quant_config._is_dynamic_token_w4a8(weight_quant, input_quant):
             if _is_npu:
                 logger.info_once("Using NPUCompressedTensorsW4A8Int8DynamicMoEMethod")
                 return NPUCompressedTensorsW4A8Int8DynamicMoEMethod(quant_config)

From d5ad3a19ac52b58017abac2371b7a6daddff533a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 30 Dec 2025 14:01:44 +0300
Subject: [PATCH 161/175] Fix test again

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 2bc0df0c1687..e77adb7df159 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -103,7 +103,7 @@ def test_throughput(self):
         print(f"Throughput: {throughput} tokens/s")
 
         if is_in_ci():
-            self.assertAlmostEqual(throughput, 35)
+            self.assertGreaterEqual(throughput, 35)
 
 
 if __name__ == "__main__":

From a657e8727ffd99aeee27c96ae7e5cf127227b060 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Sat, 3 Jan 2026 21:47:47 +0300
Subject: [PATCH 162/175] Update test order

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++--
 test/srt/run_suite.py                            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index e77adb7df159..8674786a8433 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -76,7 +76,7 @@ def test_gsm8k(self):
         print(metrics)
 
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
-        self.assertGreaterEqual(metrics["output_throughput"], 1000)
+        self.assertGreaterEqual(metrics["output_throughput"], 700)
 
     def run_decode(self, max_new_tokens):
         response = requests.post(
@@ -103,7 +103,7 @@ def test_throughput(self):
         print(f"Throughput: {throughput} tokens/s")
 
         if is_in_ci():
-            self.assertGreaterEqual(throughput, 35)
+            self.assertGreaterEqual(throughput, 25)
 
 
 if __name__ == "__main__":
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index ad726b40b238..777382c83388 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -364,12 +364,12 @@
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),
     ],
     "per-commit-4-npu-a2": [
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_hicache_mla.py", 400),
         TestFile("ascend/test_ascend_tp4_bf16.py", 400),
     ],
     "per-commit-16-npu-a3": [
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_deepep.py", 400),
         TestFile("ascend/test_ascend_deepseek_mtp.py", 400),
     ],

From ff565dbc927ff6a13ac11857d861e6d373e08174 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Sun, 4 Jan 2026 13:44:46 +0300
Subject: [PATCH 163/175] Move w4a4_test to a2-tp1 suite

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 14 ++++++--------
 test/srt/run_suite.py                            |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 8674786a8433..f6982bfcaa82 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -21,10 +21,8 @@
     popen_launch_server,
 )
 
-os.environ["SGLANG_USE_MODELSCOPE"] = "true"
-
 if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0"
 DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
     7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
 )
@@ -47,11 +45,11 @@ def setUpClass(cls):
                 "--attention-backend",
                 "ascend",
                 "--tp-size",
-                "4",
+                "1",
                 "--mem-fraction-static",
                 "0.8",
                 "--cuda-graph-bs",
-                "64",
+                "16",
                 "--disable-radix-cache",
             ],
         )
@@ -68,7 +66,7 @@ def test_gsm8k(self):
             data_path=None,
             num_questions=1319,
             max_new_tokens=512,
-            parallel=64,
+            parallel=16,
             host=f"http://{url.hostname}",
             port=int(url.port),
         )
@@ -76,7 +74,7 @@ def test_gsm8k(self):
         print(metrics)
 
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
-        self.assertGreaterEqual(metrics["output_throughput"], 700)
+        self.assertGreaterEqual(metrics["output_throughput"], 500)
 
     def run_decode(self, max_new_tokens):
         response = requests.post(
@@ -103,7 +101,7 @@ def test_throughput(self):
         print(f"Throughput: {throughput} tokens/s")
 
         if is_in_ci():
-            self.assertGreaterEqual(throughput, 25)
+            self.assertGreaterEqual(throughput, 15)
 
 
 if __name__ == "__main__":
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 6862cb72b061..233f0d9604e0 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -355,6 +355,7 @@
         TestFile("ascend/test_ascend_sampling_backend.py", 400),
         TestFile("ascend/test_ascend_tp1_bf16.py", 400),
         TestFile("ascend/test_ascend_compile_graph_tp1_bf16.py", 400),
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_w8a8_quantization.py", 400),
         TestFile("test_embed_interpolate_unittest.py", 400),
     ],
@@ -365,7 +366,6 @@
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),
     ],
     "per-commit-4-npu-a2": [
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_hicache_mla.py", 400),
         TestFile("ascend/test_ascend_tp4_bf16.py", 400),

From c97c232aadb8d7fadb288d7da46f48078d79ad70 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Sun, 4 Jan 2026 13:47:46 +0300
Subject: [PATCH 164/175] Move w4a4_test to a2-tp1 suite

---
 .../ascend/test_ascend_w4a4_quantization.py    | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index f6982bfcaa82..420ffd113bbf 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -9,8 +9,6 @@
 from types import SimpleNamespace
 from urllib.parse import urlparse
 
-import requests
-
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (
@@ -64,7 +62,7 @@ def test_gsm8k(self):
         args = SimpleNamespace(
             num_shots=5,
             data_path=None,
-            num_questions=1319,
+            num_questions=200,
             max_new_tokens=512,
             parallel=16,
             host=f"http://{url.hostname}",
@@ -76,20 +74,6 @@ def test_gsm8k(self):
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
         self.assertGreaterEqual(metrics["output_throughput"], 500)
 
-    def run_decode(self, max_new_tokens):
-        response = requests.post(
-            self.base_url + "/generate",
-            json={
-                "text": "The capital of France is",
-                "sampling_params": {
-                    "temperature": 0,
-                    "max_new_tokens": max_new_tokens,
-                },
-                "ignore_eos": True,
-            },
-        )
-        return response.json()
-
     def test_throughput(self):
         max_tokens = 256
 

From c190ea30a66aadcb7de947f8b83fe90bbe569d1d Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Sun, 4 Jan 2026 15:02:37 +0300
Subject: [PATCH 165/175] Return w4a4 to A3

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 8 ++++----
 test/srt/run_suite.py                            | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 420ffd113bbf..0a287e869400 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -20,7 +20,7 @@
 )
 
 if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0"
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
 DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
     7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
 )
@@ -43,7 +43,7 @@ def setUpClass(cls):
                 "--attention-backend",
                 "ascend",
                 "--tp-size",
-                "1",
+                "4",
                 "--mem-fraction-static",
                 "0.8",
                 "--cuda-graph-bs",
@@ -72,7 +72,7 @@ def test_gsm8k(self):
         print(metrics)
 
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
-        self.assertGreaterEqual(metrics["output_throughput"], 500)
+        self.assertGreaterEqual(metrics["output_throughput"], 1000)
 
     def test_throughput(self):
         max_tokens = 256
@@ -85,7 +85,7 @@ def test_throughput(self):
         print(f"Throughput: {throughput} tokens/s")
 
         if is_in_ci():
-            self.assertGreaterEqual(throughput, 15)
+            self.assertGreaterEqual(throughput, 35)
 
 
 if __name__ == "__main__":
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 233f0d9604e0..9cc7b17ba30f 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -355,7 +355,6 @@
         TestFile("ascend/test_ascend_sampling_backend.py", 400),
         TestFile("ascend/test_ascend_tp1_bf16.py", 400),
         TestFile("ascend/test_ascend_compile_graph_tp1_bf16.py", 400),
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_w8a8_quantization.py", 400),
         TestFile("test_embed_interpolate_unittest.py", 400),
     ],
@@ -373,6 +372,7 @@
     "per-commit-16-npu-a3": [
         TestFile("ascend/test_ascend_deepep.py", 400),
         TestFile("ascend/test_ascend_deepseek_mtp.py", 400),
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
     ],
 }
 

From 659fa074b1d49307dec31515eb32cd5cd5aa5252 Mon Sep 17 00:00:00 2001
From: OrangeRedeng <sae.20@bk.ru>
Date: Sun, 4 Jan 2026 15:42:53 +0300
Subject: [PATCH 166/175] Remove unused is_npu()

---
 python/sglang/srt/configs/model_config.py         |  3 +--
 python/sglang/srt/layers/quantization/__init__.py | 11 ++---------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 327ef0466bcd..4c08ce5eace7 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -26,7 +26,7 @@
 from sglang.srt.environ import envs
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_hip, is_npu, retry
+from sglang.srt.utils import is_hip, retry
 from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
@@ -37,7 +37,6 @@
 from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
-_is_npu = is_npu()
 
 
 class AttentionArch(IntEnum):
diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py
index 161301d6124e..ba9755c6a04a 100644
--- a/python/sglang/srt/layers/quantization/__init__.py
+++ b/python/sglang/srt/layers/quantization/__init__.py
@@ -31,6 +31,7 @@ def override_quantization_method(self, *args, **kwargs):
     ModelOptFp4Config,
     ModelOptFp8Config,
 )
+from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
 from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config
 from sglang.srt.layers.quantization.petit import PetitNvFp4Config
@@ -68,6 +69,7 @@ def override_quantization_method(self, *args, **kwargs):
     "fbgemm_fp8": FBGEMMFp8Config,
     "quark": QuarkConfig,
     "auto-round": AutoRoundConfig,
+    "modelslim": ModelSlimConfig,
 }
 
 
@@ -78,15 +80,6 @@ def override_quantization_method(self, *args, **kwargs):
         }
     )
 
-if is_npu():
-    from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig
-
-    BASE_QUANTIZATION_METHODS.update(
-        {
-            "modelslim": ModelSlimConfig,
-        }
-    )
-
 QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS}
 
 
From 4716b7320f80ed908ed721fd2c04aaf7432ad7d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 12 Jan 2026 12:34:41 +0300
Subject: [PATCH 167/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 0a287e869400..5e8b729966f2 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -47,7 +47,7 @@ def setUpClass(cls):
                 "--mem-fraction-static",
                 "0.8",
                 "--cuda-graph-bs",
-                "16",
+                "64",
                 "--disable-radix-cache",
             ],
         )
@@ -62,9 +62,9 @@ def test_gsm8k(self):
         args = SimpleNamespace(
             num_shots=5,
             data_path=None,
-            num_questions=200,
+            num_questions=1319,
             max_new_tokens=512,
-            parallel=16,
+            parallel=64,
             host=f"http://{url.hostname}",
             port=int(url.port),
         )

From 42d849e9c08d1d148c0a6a3ad8dbd69f103b59b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 12 Jan 2026 14:56:22 +0300
Subject: [PATCH 168/175] Fix test_ascend_piecewise_graph_prefill test

---
 test/srt/ascend/test_ascend_piecewise_graph_prefill.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_piecewise_graph_prefill.py b/test/srt/ascend/test_ascend_piecewise_graph_prefill.py
index 9e43ca60f74b..13c9d991a0bb 100644
--- a/test/srt/ascend/test_ascend_piecewise_graph_prefill.py
+++ b/test/srt/ascend/test_ascend_piecewise_graph_prefill.py
@@ -38,7 +38,7 @@ def setUpClass(cls):
                 128,
                 "--enable-piecewise-cuda-graph",
                 "--piecewise-cuda-graph-tokens",
-                TOKENS_TO_CAPTURE,
+                *TOKENS_TO_CAPTURE,
             ],
         )
 
@@ -79,7 +79,7 @@ def test_latency(self):
                 "ascend",
                 "--enable-piecewise-cuda-graph",
                 "--piecewise-cuda-graph-tokens",
-                TOKENS_TO_CAPTURE,
+                *TOKENS_TO_CAPTURE,
             ],
         )
         self.assertLess(prefill_latency, EXP_PREFILL_LATENCY)

From 9a95ff824ecbb65b7f22355339ebf25e506e352e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 11:06:19 +0300
Subject: [PATCH 169/175] Move w4a4 test to A2

---
 test/srt/run_suite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 122e4cf8ceea..6d146c7cbc76 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -158,6 +158,7 @@
         TestFile("test_embed_interpolate_unittest.py", 400),
     ],
     "per-commit-2-npu-a2": [
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
@@ -171,7 +172,6 @@
     "per-commit-16-npu-a3": [
         TestFile("ascend/test_ascend_deepep.py", 400),
         TestFile("ascend/test_ascend_deepseek_mtp.py", 400),
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
     ],
 }
 

From d323c6a12e79d93ddd399184d5dcd14e248843fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 11:07:18 +0300
Subject: [PATCH 170/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 5e8b729966f2..e838c87f9612 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -20,7 +20,7 @@
 )
 
 if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
 DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
     7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
 )
@@ -43,7 +43,7 @@ def setUpClass(cls):
                 "--attention-backend",
                 "ascend",
                 "--tp-size",
-                "4",
+                "2",
                 "--mem-fraction-static",
                 "0.8",
                 "--cuda-graph-bs",

From 7e3d2815cc310dde4212a3f72e6c20439f89e154 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 12:00:40 +0300
Subject: [PATCH 171/175] Update run_suite.py

---
 test/srt/run_suite.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 6d146c7cbc76..581a1e09c6a2 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -158,7 +158,6 @@
         TestFile("test_embed_interpolate_unittest.py", 400),
     ],
     "per-commit-2-npu-a2": [
-        TestFile("ascend/test_ascend_w4a4_quantization.py", 400),
         TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
@@ -170,8 +169,9 @@
         TestFile("ascend/test_ascend_tp4_bf16.py", 400),
     ],
     "per-commit-16-npu-a3": [
-        TestFile("ascend/test_ascend_deepep.py", 400),
-        TestFile("ascend/test_ascend_deepseek_mtp.py", 400),
+        TestFile("ascend/test_ascend_deepep.py", 3600),
+        TestFile("ascend/test_ascend_deepseek_mtp.py", 2800),
+        TestFile("ascend/test_ascend_w4a4_quantization.py", 600),
     ],
 }
 

From 601a349a339fc13f95e0442566d1a740d382a9b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 12:00:58 +0300
Subject: [PATCH 172/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index e838c87f9612..5e8b729966f2 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -20,7 +20,7 @@
 )
 
 if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
 DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
     7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
 )
@@ -43,7 +43,7 @@ def setUpClass(cls):
                 "--attention-backend",
                 "ascend",
                 "--tp-size",
-                "2",
+                "4",
                 "--mem-fraction-static",
                 "0.8",
                 "--cuda-graph-bs",

From 0d16e53ba6cce3ee38d6b21fef25b1c9ab5beb4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 14:59:50 +0300
Subject: [PATCH 173/175] Update test_ascend_w4a4_quantization.py

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 5e8b729966f2..7c5e33547371 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -72,7 +72,7 @@ def test_gsm8k(self):
         print(metrics)
 
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
-        self.assertGreaterEqual(metrics["output_throughput"], 1000)
+        self.assertGreaterEqual(metrics["output_throughput"], 1050)
 
     def test_throughput(self):
         max_tokens = 256

From 7b9e6143c3034a3ddeaf372e506993452b241bb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 17:47:25 +0300
Subject: [PATCH 174/175] Fix w4a4 test

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index 7c5e33547371..ee56f5b9825f 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -72,7 +72,21 @@ def test_gsm8k(self):
         print(metrics)
 
         self.assertGreaterEqual(metrics["accuracy"], 0.80)
-        self.assertGreaterEqual(metrics["output_throughput"], 1050)
+        self.assertGreaterEqual(metrics["output_throughput"], 1000)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
 
     def test_throughput(self):
         max_tokens = 256

From a79e4b9ae5b1af20993242c4199ea8bc43e175c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 18:24:15 +0300
Subject: [PATCH 175/175] Fix w4a4 test

---
 test/srt/ascend/test_ascend_w4a4_quantization.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/srt/ascend/test_ascend_w4a4_quantization.py b/test/srt/ascend/test_ascend_w4a4_quantization.py
index ee56f5b9825f..22d3f0615181 100644
--- a/test/srt/ascend/test_ascend_w4a4_quantization.py
+++ b/test/srt/ascend/test_ascend_w4a4_quantization.py
@@ -9,6 +9,8 @@
 from types import SimpleNamespace
 from urllib.parse import urlparse
 
+import requests
+
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (