From 1f46b7db7c1fec237182761f99187257d7962bb4 Mon Sep 17 00:00:00 2001 From: thomaswang Date: Tue, 18 Nov 2025 21:09:48 -0600 Subject: [PATCH 1/4] Remove AITER_MXFP4_MOE_SF usage and add is_shuffled flag --- python/sglang/srt/layers/quantization/mxfp4.py | 12 ++++++++++-- .../srt/layers/quantization/quark/quark_moe.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py index 847eaf0ee250..dc36d49245a0 100644 --- a/python/sglang/srt/layers/quantization/mxfp4.py +++ b/python/sglang/srt/layers/quantization/mxfp4.py @@ -43,6 +43,7 @@ is_cuda, is_flashinfer_available, is_hip, + is_gfx95_supported, is_sm100_supported, is_triton_kernels_available, log_info_on_rank0, @@ -72,7 +73,7 @@ ) _is_hip = is_hip() -_is_shuffle_moe_mxfp4 = get_bool_env_var("AITER_MXFP4_MOE_SF") and _is_hip +_is_shuffle_moe_mxfp4 = is_gfx95_supported() if _is_hip: # import aiter @@ -804,14 +805,17 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: w2, w2_mx_scales = self.mxfp4_quantize(layer.w2_weight.data) # Pre-shuffle weight - if _is_shuffle_moe_mxfp4: + is_shuffled = _is_shuffle_moe_mxfp4 + if is_shuffled: w13 = shuffle_weight(w13.contiguous(), (16, 16)) w2 = shuffle_weight(w2.contiguous(), (16, 16)) layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False) + layer.w13_weight.is_shuffled = is_shuffled layer.w13_weight_scale = torch.nn.Parameter(w13_mx_scales, requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False) + layer.w2_weight.is_shuffled = is_shuffled layer.w2_weight_scale = torch.nn.Parameter(w2_mx_scales, requires_grad=False) def create_moe_runner( @@ -841,6 +845,10 @@ def apply( else: w13_weight = layer.w13_weight w2_weight = layer.w2_weight + + if hasattr(layer.w13_weight, "is_shuffled"): + w13_weight.is_shuffled = True + w2_weight.is_shuffled = True output = fused_moe( x, diff --git a/python/sglang/srt/layers/quantization/quark/quark_moe.py b/python/sglang/srt/layers/quantization/quark/quark_moe.py index 497e69b8e679..6bb7ba9374bc 100644 --- a/python/sglang/srt/layers/quantization/quark/quark_moe.py +++ b/python/sglang/srt/layers/quantization/quark/quark_moe.py @@ -13,7 +13,7 @@ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz from sglang.srt.layers.quantization.utils import all_close_1d, per_tensor_dequantize -from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs +from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs, is_gfx95_supported if TYPE_CHECKING: from sglang.srt.layers.moe.token_dispatcher import ( @@ -24,8 +24,7 @@ logger = logging.getLogger(__name__) -_is_hip = is_hip() -_is_shuffle_moe_mxfp4 = get_bool_env_var("AITER_MXFP4_MOE_SF") and _is_hip +_is_shuffle_moe_mxfp4 = is_gfx95_supported() __all__ = ["QuarkMoEMethod", "QuarkW4A4MXFp4MoEMethod"] @@ -190,6 +189,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight.data = shuffle_weight( layer.w2_weight.contiguous(), (16, 16) ) + layer.w13_weight.is_shuffled = True + layer.w2_weight.is_shuffled = True def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig @@ -219,6 +220,10 @@ def apply( else: w13_weight = layer.w13_weight w2_weight = layer.w2_weight + + if hasattr(layer.w13_weight, "is_shuffled"): + w13_weight.is_shuffled = True + w2_weight.is_shuffled = True output = fused_moe( x, From 1ec07e7c6f67f47e044fb766c44335b62fdbd941 Mon Sep 17 00:00:00 2001 From: thomaswang Date: Tue, 18 Nov 2025 22:05:10 -0600 Subject: [PATCH 2/4] Remove AITER_MXFP4_MOE_SF in rocm dockerfile --- docker/rocm.Dockerfile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile index 1cc106a2ec09..b50a9aca7a15 100644 --- a/docker/rocm.Dockerfile +++ b/docker/rocm.Dockerfile @@ -22,7 +22,6 @@ ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" ENV AITER_COMMIT="v0.1.4" ENV NO_DEPS_FLAG="" -ENV AITER_MXFP4_MOE_SF="0" # =============================== # Base image 942 and args @@ -34,7 +33,6 @@ ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" ENV AITER_COMMIT="v0.1.7.post1" ENV NO_DEPS_FLAG="" -ENV AITER_MXFP4_MOE_SF="0" # =============================== # Base image 950 and args @@ -44,9 +42,8 @@ ENV BUILD_TRITON="0" ENV BUILD_LLVM="0" ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" -ENV AITER_COMMIT="v0.1.7.post1" +ENV AITER_COMMIT="169acefff8f96832625f6ea33eeb4c875b423594" ENV NO_DEPS_FLAG="" -ENV AITER_MXFP4_MOE_SF="1" # =============================== # Chosen arch and args FROM ${GPU_ARCH} @@ -107,8 +104,7 @@ RUN git clone ${AITER_REPO} \ && git checkout ${AITER_COMMIT} \ && git submodule update --init --recursive RUN cd aiter \ - && if [ "$GPU_ARCH" = "gfx950" ]; then export AITER_MXFP4_MOE_SF=1; fi \ - && echo "[AITER] GPU_ARCH=${GPU_ARCH} AITER_MXFP4_MOE_SF=${AITER_MXFP4_MOE_SF:-unset}" \ + && echo "[AITER] GPU_ARCH=${GPU_ARCH}" \ && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \ sh -c "HIP_CLANG_PATH=/sgl-workspace/llvm-project/build/bin/ PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ elif [ "$BUILD_AITER_ALL" = "1" ]; then \ @@ -299,7 +295,6 @@ RUN python3 -m pip install --no-cache-dir \ # ----------------------- # Performance environment variable. -RUN echo "AITER_MXFP4_MOE_SF=${AITER_MXFP4_MOE_SF}" >> /etc/environment ENV HIP_FORCE_DEV_KERNARG=1 ENV HSA_NO_SCRATCH_RECLAIM=1 From 4c68966bdb547163687f450b7d911a4c2615944c Mon Sep 17 00:00:00 2001 From: thomaswang Date: Tue, 18 Nov 2025 22:45:11 -0600 Subject: [PATCH 3/4] Modify with the pre-commit fix --- python/sglang/srt/layers/quantization/mxfp4.py | 5 ++--- python/sglang/srt/layers/quantization/quark/quark_moe.py | 9 +++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py index dc36d49245a0..d44444a3a8c9 100644 --- a/python/sglang/srt/layers/quantization/mxfp4.py +++ b/python/sglang/srt/layers/quantization/mxfp4.py @@ -39,11 +39,10 @@ from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import ( direct_register_custom_op, - get_bool_env_var, is_cuda, is_flashinfer_available, - is_hip, is_gfx95_supported, + is_hip, is_sm100_supported, is_triton_kernels_available, log_info_on_rank0, @@ -845,7 +844,7 @@ def apply( else: w13_weight = layer.w13_weight w2_weight = layer.w2_weight - + if hasattr(layer.w13_weight, "is_shuffled"): w13_weight.is_shuffled = True w2_weight.is_shuffled = True diff --git a/python/sglang/srt/layers/quantization/quark/quark_moe.py b/python/sglang/srt/layers/quantization/quark/quark_moe.py index 6bb7ba9374bc..e4839220103f 100644 --- a/python/sglang/srt/layers/quantization/quark/quark_moe.py +++ b/python/sglang/srt/layers/quantization/quark/quark_moe.py @@ -13,7 +13,12 @@ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz from sglang.srt.layers.quantization.utils import all_close_1d, per_tensor_dequantize -from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs, is_gfx95_supported +from sglang.srt.utils import ( + get_bool_env_var, + is_gfx95_supported, + is_hip, + set_weight_attrs, +) if TYPE_CHECKING: from sglang.srt.layers.moe.token_dispatcher import ( @@ -220,7 +225,7 @@ def apply( else: w13_weight = layer.w13_weight w2_weight = layer.w2_weight - + if hasattr(layer.w13_weight, "is_shuffled"): w13_weight.is_shuffled = True w2_weight.is_shuffled = True From 8aee134d662b48c2d45bfd8aa04c701a9fd65a84 Mon Sep 17 00:00:00 2001 From: thomaswang Date: Wed, 19 Nov 2025 07:48:52 -0600 Subject: [PATCH 4/4] Turn AITER_COMMIT to aiter tag --- docker/rocm.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile index b50a9aca7a15..d591400c6ce1 100644 --- a/docker/rocm.Dockerfile +++ b/docker/rocm.Dockerfile @@ -42,7 +42,7 @@ ENV BUILD_TRITON="0" ENV BUILD_LLVM="0" ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" -ENV AITER_COMMIT="169acefff8f96832625f6ea33eeb4c875b423594" +ENV AITER_COMMIT="v0.1.7.post2" ENV NO_DEPS_FLAG="" # =============================== # Chosen arch and args