sgl-project · iforgetmyname · Jan 14, 2026 · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025
@@ -16,6 +16,7 @@
 /python/sglang/srt/function_call @CatherineSue @JustinTong0323
 /python/sglang/srt/grpc @CatherineSue @slin1237
 /python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname
+/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname
 /python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1
 /python/sglang/srt/layers/attention @merrymercy @Fridge003 @ispobock @Qiaolin-Yu @hebiao064
 /python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064

diff --git a/docs/platforms/ascend_npu_deepseek_example.md b/docs/platforms/ascend_npu_deepseek_example.md
@@ -30,7 +30,6 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --attention-backend ascend \
     --device npu \
-    --quantization modelslim \
     --watchdog-timeout 9000 \
     --cuda-graph-bs 8 16 24 28 32 \
     --mem-fraction-static 0.68 \
@@ -88,7 +87,6 @@ python -m sglang.launch_server \
     --mem-fraction-static 0.6 \
     --attention-backend ascend \
     --device npu \
-    --quantization modelslim \
     --max-running-requests 8 \
     --context-length 8192 \
     --disable-radix-cache \
@@ -144,7 +142,6 @@ python -m sglang.launch_server \
     --max-running-requests 352 \
     --attention-backend ascend \
     --device npu \
-    --quantization modelslim \
     --moe-a2a-backend deepep \
     --enable-dp-attention \
     --deepep-mode low_latency \
@@ -216,7 +213,6 @@ do
       --mem-fraction-static 0.81 \
       --attention-backend ascend \
       --device npu \
-      --quantization modelslim \
       --max-running-requests 8 \
       --context-length 8192 \
       --disable-radix-cache \
@@ -279,7 +275,6 @@ do
       --max-running-requests 832 \
       --attention-backend ascend \
       --device npu \
-      --quantization modelslim \
       --moe-a2a-backend deepep \
       --enable-dp-attention \
       --deepep-mode low_latency \

diff --git a/docs/platforms/ascend_npu_quantization.md b/docs/platforms/ascend_npu_quantization.md
@@ -0,0 +1,21 @@
+# Quantization on Ascend
+
+To load an already quantized model, simply load the model weights and config. If the model has been quantized offline, there is no need to pass the `--quantization` argument when starting the engine: the quantization method is parsed automatically from the downloaded `quant_model_description.json` or `config.json`.
+
+[ModelSlim on Ascend support](https://github.com/sgl-project/sglang/pull/14504):
+- [x] W4A4 dynamic linear
+- [x] W8A8 static linear
+- [x] W8A8 dynamic linear
+- [x] W4A8 dynamic MOE
+- [x] W8A8 dynamic MOE
+
+[AWQ on Ascend support](https://github.com/sgl-project/sglang/pull/10158):
+- [x] W4A16 linear
+- [x] W8A16 linear (needs testing)
+- [x] W4A16 MOE (needs testing)
+
+Compressed-tensors (LLM Compressor) on Ascend support:
+- [x] [W4A8 dynamic MOE with/without activation clip](https://github.com/sgl-project/sglang/pull/14736) (needs testing)
+- [x] [W4A16 MOE](https://github.com/sgl-project/sglang/pull/12759)
+- [x] [W8A8 dynamic linear](https://github.com/sgl-project/sglang/pull/14504)
+- [x] [W8A8 dynamic MOE](https://github.com/sgl-project/sglang/pull/14504)
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
@@ -17,6 +17,7 @@
 import math
 import os
 from enum import Enum, IntEnum, auto
+from pathlib import Path
 from typing import Any, List, Optional, Set, Union
 
 import torch
@@ -632,6 +633,18 @@ def _parse_quant_hf_config(self):
                 quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
         return quant_cfg
 
+    def _find_quant_modelslim_config(self) -> Optional[dict]:
+        """Return model_path's quant_model_description.json as a dict, or None if absent."""
+        quant_config_file = Path(self.model_path, "quant_model_description.json")
+        quant_cfg = None
+        if quant_config_file.is_file():
+            with open(quant_config_file) as f:
+                quant_cfg = json.load(f)
+            # The file itself lacks "quant_method", which flagless loading needs; add it.
+            quant_cfg["quant_method"] = "modelslim"
+
+        return quant_cfg
+
     def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
         """Parse ModelOpt quantization config and return the appropriate quant_method."""
         json_quant_configs = quant_config_dict["quantization"]
@@ -744,6 +757,7 @@ def _verify_quantization(self) -> None:
             "w4afp8",
             "petit_nvfp4",
             "quark",
+            "modelslim",
         ]
         compatible_quantization_methods = {
             "modelopt_fp8": ["modelopt"],
@@ -755,8 +769,19 @@ def _verify_quantization(self) -> None:
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
 
-        # Parse quantization method from the HF model config, if available.
-        quant_cfg = self._parse_quant_hf_config()
+        # Parse quantization method from the HF and ModelSlim model config, if available.
+        # Only one function should return config, other should return None.
+        cfg_list = []
+        cfg_list.append(self._parse_quant_hf_config())
+        cfg_list.append(self._find_quant_modelslim_config())
+
+        # Filter out None values
+        cfg_list = [item for item in cfg_list if item is not None]
+        if len(cfg_list) > 1:
+            raise ValueError(
+                "Config list contains configs from 2 methods, must be only 1"
+            )
+        quant_cfg = cfg_list[0] if cfg_list else None
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get(