Merged
Changes from all commits (43 commits)
- 70d6f6a addd quark_int4_fp8_moe feature (May 12, 2025)
- 676f854 wip int4fp8_moe version using weight_loader for online quantization (fxmarty-amd, Jun 20, 2025)
- adb7121 simplifications (fxmarty-amd, Jun 20, 2025)
- 25e214b fix accuracy issue, support tp>1 (fxmarty-amd, Jun 20, 2025)
- 2040998 add doc (fxmarty-amd, Jun 25, 2025)
- 3df5e1e fix get_name (fxmarty-amd, Jun 25, 2025)
- a475709 simplifications (fxmarty-amd, Jun 25, 2025)
- a7e6598 pre-shard high precision weight in order to do online quantization pe… (fxmarty-amd, Jun 25, 2025)
- 70e157a Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Jun 25, 2025)
- d2f6f34 fix merge issues (fxmarty-amd, Jun 25, 2025)
- ba89177 support pre-sharded MOE (fxmarty-amd, Jun 26, 2025)
- 2111f53 Merge branch 'main' into int4fp8_moe_new (HaiShaw, Jul 7, 2025)
- fd1aa21 Merge branch 'main' into int4fp8_moe_new (HaiShaw, Jul 7, 2025)
- ffb4150 Merge branch 'main' into int4fp8_moe_new (HaiShaw, Jul 7, 2025)
- af270c6 Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Jul 15, 2025)
- c708ccd Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Jul 23, 2025)
- 33649cc Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Sep 11, 2025)
- fa1804c fix issues (fxmarty-amd, Sep 12, 2025)
- 0137d68 address comment (fxmarty-amd, Sep 12, 2025)
- 36f79da simplification (fxmarty-amd, Sep 12, 2025)
- 37df93f reuse Fp8LinearMethod instead of reimplementing it (fxmarty-amd, Sep 12, 2025)
- ae947d4 update test model (fxmarty-amd, Sep 12, 2025)
- ad6794f Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Sep 12, 2025)
- 30dcb1e lint (fxmarty-amd, Sep 12, 2025)
- 686bd32 remove unused imports (fxmarty-amd, Sep 12, 2025)
- 263bc85 Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Dec 15, 2025)
- f59499b Merge branch 'main' into int4fp8_moe_new (HaiShaw, Dec 16, 2025)
- 5f1fbeb Merge branch 'main' into int4fp8_moe_new (HaiShaw, Dec 16, 2025)
- 9c1da32 Merge branch 'main' into int4fp8_moe_new (HaiShaw, Dec 20, 2025)
- 839cd85 Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Dec 22, 2025)
- 50aa07f rename int4fp8_moe to quark_int4fp8_moe, add test_int4fp8_moe.py to r… (fxmarty-amd, Dec 22, 2025)
- 3815e81 linting (fxmarty-amd, Dec 22, 2025)
- cf80ab5 Merge branch 'int4fp8_moe_new' of https://github.com/fxmarty-amd/sgla… (fxmarty-amd, Dec 22, 2025)
- 3615eb8 fix names (fxmarty-amd, Dec 22, 2025)
- 847744e Merge branch 'main' into int4fp8_moe_new (HaiShaw, Jan 2, 2026)
- 5221165 Merge branch 'main' into int4fp8_moe_new (HaiShaw, Jan 5, 2026)
- 8c4a8fe fix wrong import (fxmarty-amd, Jan 6, 2026)
- d9122e9 Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Jan 6, 2026)
- 653874d Merge branch 'int4fp8_moe_new' of https://github.com/fxmarty-amd/sgla… (fxmarty-amd, Jan 6, 2026)
- ba306f5 linting (fxmarty-amd, Jan 6, 2026)
- 46639a0 Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Jan 8, 2026)
- ca48935 Merge branch 'main' into int4fp8_moe_new (fxmarty-amd, Jan 8, 2026)
- f751cff Merge branch 'main' into int4fp8_moe_new (yctseng0211, Jan 9, 2026)
8 changes: 8 additions & 0 deletions docs/advanced_features/quantization.md
@@ -353,6 +353,8 @@ python3 -m sglang.launch_server \

Our team is working on supporting more online quantization methods. SGLang will soon support methods including but not limited to `["awq", "gptq", "marlin", "gptq_marlin", "awq_marlin", "bitsandbytes", "gguf"]`.

### torchao online quantization method

SGLang also supports quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` on the command line to enable this feature. For example, to enable `int4wo-128` for the model `meta-llama/Meta-Llama-3.1-8B-Instruct`, launch the server with the following command:

```bash
@@ -374,6 +376,12 @@ python3 -m sglang.launch_server \
--port 30000 --host 0.0.0.0
```

### `quark_int4fp8_moe` online quantization method

On AMD GPUs (CDNA3 or CDNA4 architecture), SGLang supports the quantization method `--quantization quark_int4fp8_moe`. It replaces [MoE layers](https://github.com/sgl-project/sglang/blob/v0.4.8/python/sglang/srt/layers/moe/fused_moe_triton/layer.py#L271) originally stored in high precision (bfloat16, float16 or float32) with weights dynamically quantized to int4. During inference, these int4 weights are upcast to float8 so that compute runs in float8 precision, with activations dynamically quantized to float8 on the fly.

Other layers (e.g. projections in the attention layers) have their weights quantized online to float8 directly.
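For example, the server could be launched as follows (the model path below is a placeholder, not a specific tested checkpoint; any MoE model stored in high precision applies):

```bash
python3 -m sglang.launch_server \
    --model-path <path-to-high-precision-moe-model> \
    --quantization quark_int4fp8_moe \
    --port 30000 --host 0.0.0.0
```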

## Reference

- [GPTQModel](https://github.com/ModelCloud/GPTQModel)
1 change: 1 addition & 0 deletions python/sglang/srt/configs/model_config.py
@@ -716,6 +716,7 @@ def _verify_quantization(self) -> None:
"quark",
"mxfp4",
"auto-round",
"quark_int4fp8_moe",
]
optimized_quantization_methods = [
"fp8",
73 changes: 73 additions & 0 deletions python/sglang/srt/layers/int4fp8_utils.py
@@ -0,0 +1,73 @@
"""
Common utilities for quark.
"""

import logging
from typing import Tuple

import torch

logger = logging.getLogger(__name__)


def quantize_fp8_scale_tensorwise(w: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
FP8_MAX = 448.0
scale = w.abs().amax().float() / FP8_MAX
scaled = (w / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
return scaled, scale


def quantize_int4_scale_columnwise(
w: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
S4_MAX = 7
w_flat = w.reshape(-1, w.shape[-1]).float()
scale = w_flat.abs().amax(axis=-1) / S4_MAX
scaled = torch.round(w_flat / scale[:, None]).to(torch.int8).clamp(-S4_MAX, S4_MAX)
return scaled.reshape(w.shape), scale.reshape(w.shape[:-1])


def pack_int4_to_int32(to_pack: torch.Tensor, reorder: bool = True) -> torch.Tensor:
if to_pack.ndim > 2:
raise ValueError(
"Pack: Only supports tensors with dimensions not greater than 2."
)

if reorder:
order_map = [0, 2, 4, 6, 1, 3, 5, 7]
else:
order_map = [0, 1, 2, 3, 4, 5, 6, 7]
pack_num = 8
if to_pack.ndim == 2:
packed = torch.zeros(
to_pack.shape[0],
to_pack.shape[1] // pack_num,
dtype=torch.int32,
device=to_pack.device,
)
new_c = to_pack.shape[1] // pack_num
for c in range(new_c):
for i in range(pack_num):
# Use -3 as an example, high_position is 11111111,cause bit_or generate errors, so we can't use int4 directly
packed_col = to_pack[:, c * pack_num + order_map[i]].to(torch.int32)
packed_col = packed_col & 0x0F
packed[:, c] = torch.bitwise_or(
packed[:, c], torch.bitwise_left_shift(packed_col, i * 4)
)
elif to_pack.ndim == 0:
packed = to_pack.to(torch.int32)
else:
packed = torch.zeros(
to_pack.shape[0] // pack_num, dtype=torch.int32, device=to_pack.device
)
new_c = to_pack.shape[0] // pack_num
for c in range(new_c):
for i in range(pack_num):
# Use -3 as an example, high_position is 11111111,cause bit_or generate errors, so we can't use int4 directly
packed_col = to_pack[c * pack_num + order_map[i]]
packed_col = packed_col & 0x0F
packed[c] = torch.bitwise_or(
packed[c], torch.bitwise_left_shift(packed_col, i * 4)
)

return packed.view(torch.uint32)
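As a sanity check on the scheme above, the following standalone sketch re-implements the int4 column-wise quantization and a vectorized equivalent of the `reorder=False` packing path, together with a hypothetical `unpack_int4` helper (not part of this PR) that verifies the round trip:

```python
import torch

S4_MAX = 7  # symmetric int4 range [-7, 7], as in quantize_int4_scale_columnwise


def quantize_int4(w: torch.Tensor):
    # Symmetric per-row scales over the last dimension.
    scale = w.abs().amax(dim=-1, keepdim=True) / S4_MAX
    q = torch.round(w / scale).clamp(-S4_MAX, S4_MAX).to(torch.int8)
    return q, scale.squeeze(-1)


def pack_int4(q: torch.Tensor) -> torch.Tensor:
    # Vectorized sketch of pack_int4_to_int32(..., reorder=False) for 2D input:
    # 8 consecutive int4 values share one int32, value i occupying bits [4*i, 4*i+4).
    rows, cols = q.shape
    nibbles = (q.to(torch.int32) & 0x0F).reshape(rows, cols // 8, 8)
    packed = nibbles[..., 0]
    for i in range(1, 8):
        packed = packed | (nibbles[..., i] << (i * 4))
    return packed


def unpack_int4(packed: torch.Tensor) -> torch.Tensor:
    # Hypothetical inverse: extract each nibble and sign-extend it back to int8.
    nibbles = torch.stack([(packed >> (i * 4)) & 0x0F for i in range(8)], dim=-1)
    nibbles = nibbles.reshape(packed.shape[0], -1)
    return torch.where(nibbles < 8, nibbles, nibbles - 16).to(torch.int8)


torch.manual_seed(0)
w = torch.randn(4, 16)
q, scale = quantize_int4(w)
packed = pack_int4(q)           # shape (4, 2): two int32 words per row of 16 values
restored = unpack_int4(packed)  # exact round trip back to the int4 values
```

The round trip is exact because packing only rearranges bits; the lossy step is the rounding in `quantize_int4`, whose error is bounded by half a scale step per element.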
1 change: 1 addition & 0 deletions python/sglang/srt/layers/linear.py
@@ -66,6 +66,7 @@
"ModelOptFp4LinearMethod",
"IPEXAWQLinearMethod",
"PetitNvFp4LinearMethod",
"QuarkInt4Fp8LinearMethod",
]

_is_cpu = is_cpu()
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/quantization/__init__.py
@@ -36,6 +36,7 @@ def override_quantization_method(self, *args, **kwargs):
from sglang.srt.layers.quantization.petit import PetitNvFp4Config
from sglang.srt.layers.quantization.qoq import QoQConfig
from sglang.srt.layers.quantization.quark.quark import QuarkConfig
from sglang.srt.layers.quantization.quark_int4fp8_moe import QuarkInt4Fp8Config
from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
@@ -68,6 +69,7 @@ def override_quantization_method(self, *args, **kwargs):
"fbgemm_fp8": FBGEMMFp8Config,
"quark": QuarkConfig,
"auto-round": AutoRoundConfig,
"quark_int4fp8_moe": QuarkInt4Fp8Config,
}

