Merged
31 commits
- 809028d add triton_fused_moe_int4 kernel (huangtingwei9988, Mar 19, 2025)
- bc56c8b move moe_wna16 to sglang (huangtingwei9988, Mar 19, 2025)
- 2fc6d61 remove unused code (huangtingwei9988, Mar 19, 2025)
- ae83c9f format code (huangtingwei9988, Mar 19, 2025)
- 3e9d24c fix circular import (laixinn, Mar 20, 2025)
- 4c23fdf fork check_marlin_supports_layer to avoid vllm dependency (laixinn, Mar 20, 2025)
- 29c2cf0 add moe_wna16 unit test for w4a16 and w8a16 (laixinn, Mar 20, 2025)
- cc12772 remove vllm dependency on getting device capability (laixinn, Mar 20, 2025)
- 995d82d Merge remote-tracking branch 'origin/main' into add-moe-wna16-kernel (AniZpZ, Mar 20, 2025)
- 0e5dda1 update (AniZpZ, Mar 20, 2025)
- 27dc72f local test (laixinn, Mar 21, 2025)
- a4c6430 format (AniZpZ, Mar 21, 2025)
- 3ae9a7e Merge branch 'main' into add-moe-wna16-kernel (zhyncs, Mar 22, 2025)
- 91133b3 merge (laixinn, Mar 24, 2025)
- db41808 format (laixinn, Mar 24, 2025)
- e6b2884 fix garbage output in test_mla_tp.py (laixinn, Mar 24, 2025)
- 79f3e1a Merge branch 'main' into add-moe-wna16-kernel (zhyncs, Mar 26, 2025)
- 7b59b48 solve conflict (laixinn, Mar 27, 2025)
- b79ff4f Merge remote-tracking branch 'upstream/main' into add-moe-wna16-kernel (laixinn, Mar 27, 2025)
- ab18fbc import AWQConfig from sglang (laixinn, Mar 27, 2025)
- fc4d919 Merge branch 'main' into add-moe-wna16-kernel (laixinn, Mar 27, 2025)
- 1f7cdea remove awq marlin support (laixinn, Mar 27, 2025)
- 354e0ea fix typo (laixinn, Mar 27, 2025)
- 9ee5b42 remove marlin utils (laixinn, Mar 27, 2025)
- 92283b8 Merge branch 'main' into add-moe-wna16-kernel (zhyncs, Apr 1, 2025)
- 7419e4c Merge branch 'main' into add-moe-wna16-kernel (zhyncs, Apr 2, 2025)
- 47d7198 Merge branch 'main' into add-moe-wna16-kernel (AniZpZ, Apr 3, 2025)
- a65a0d2 Merge branch 'main' into add-moe-wna16-kernel (zhyncs, Apr 3, 2025)
- 2d3ea37 upd doc (AniZpZ, Apr 3, 2025)
- 2368183 Merge branch 'main' into add-moe-wna16-kernel (zhyncs, Apr 3, 2025)
- 291ee58 Merge branch 'main' into add-moe-wna16-kernel (zhyncs, Apr 4, 2025)
6 changes: 4 additions & 2 deletions benchmark/deepseek_v3/README.md
@@ -178,10 +178,12 @@ python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1

### Example: Serving with 8 A100/A800 with AWQ Quantization

AWQ does not support BF16, so add the `--dtype half` flag if AWQ is used for quantization.
Review comment (Member): "AWQ does not support BF16" ... may you update this?
Reply (Author): fixed
Add the `--quantization moe_wna16` flag to enable the moe_wna16 kernel for better performance.
One example is as follows:

```bash
python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --dtype half --quantization moe_wna16
```
1 change: 1 addition & 0 deletions python/sglang/srt/configs/model_config.py
@@ -258,6 +258,7 @@ def _verify_quantization(self) -> None:
```python
            "experts_int8",
            "w8a8_int8",
            "w8a8_fp8",
            "moe_wna16",
        ]
        compatible_quantization_methods = {
            "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
```
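For context, the verification logic this diff extends can be sketched roughly as follows. This is a hypothetical, simplified stand-in for sglang's `_verify_quantization`, not the actual implementation; the function name, error messages, and the shortened method list are illustrative. It shows the two checks the diffed lines participate in: membership in the supported-methods list (which `moe_wna16` now joins) and an optional compatibility map between an override method and the checkpoint's own quantization method.

```python
# Hypothetical sketch of supported-method and compatibility checking.
# Only the two data structures visible in the diff are reproduced;
# the real list in sglang is longer and the real code lives in a class.
SUPPORTED_QUANTIZATION = [
    "experts_int8",
    "w8a8_int8",
    "w8a8_fp8",
    "moe_wna16",  # newly added by this PR
]
# Some override methods only accept checkpoints produced by specific quantizers.
COMPATIBLE_QUANTIZATION_METHODS = {
    "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
}

def verify_quantization(quantization, hf_quant_method=None):
    """Validate a --quantization override, optionally against the
    checkpoint's own quantization method (from its HF config)."""
    quantization = quantization.lower()
    if quantization not in SUPPORTED_QUANTIZATION:
        raise ValueError(f"Unknown quantization method: {quantization}")
    if hf_quant_method is not None and quantization in COMPATIBLE_QUANTIZATION_METHODS:
        if hf_quant_method not in COMPATIBLE_QUANTIZATION_METHODS[quantization]:
            raise ValueError(
                f"{quantization} is not compatible with {hf_quant_method} checkpoints"
            )
    return quantization
```

With this sketch, `verify_quantization("moe_wna16")` passes after the diff's one-line addition, whereas before it would have raised, which is why the README example above can now pass `--quantization moe_wna16`.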