From 1f46b7db7c1fec237182761f99187257d7962bb4 Mon Sep 17 00:00:00 2001
From: thomaswang <thomawan@amd.com>
Date: Tue, 18 Nov 2025 21:09:48 -0600
Subject: [PATCH 1/4] Remove AITER_MXFP4_MOE_SF usage and add is_shuffled flag

---
 python/sglang/srt/layers/quantization/mxfp4.py       | 12 ++++++++++--
 .../srt/layers/quantization/quark/quark_moe.py       | 11 ++++++++---
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py
index 847eaf0ee250..dc36d49245a0 100644
--- a/python/sglang/srt/layers/quantization/mxfp4.py
+++ b/python/sglang/srt/layers/quantization/mxfp4.py
@@ -43,6 +43,7 @@
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_gfx95_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     log_info_on_rank0,
@@ -72,7 +73,7 @@
     )
 
 _is_hip = is_hip()
-_is_shuffle_moe_mxfp4 = get_bool_env_var("AITER_MXFP4_MOE_SF") and _is_hip
+_is_shuffle_moe_mxfp4 = is_gfx95_supported()
 
 if _is_hip:
     # import aiter
@@ -804,14 +805,17 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         w2, w2_mx_scales = self.mxfp4_quantize(layer.w2_weight.data)
 
         # Pre-shuffle weight
-        if _is_shuffle_moe_mxfp4:
+        is_shuffled = _is_shuffle_moe_mxfp4
+        if is_shuffled:
             w13 = shuffle_weight(w13.contiguous(), (16, 16))
             w2 = shuffle_weight(w2.contiguous(), (16, 16))
 
         layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
+        layer.w13_weight.is_shuffled = is_shuffled
         layer.w13_weight_scale = torch.nn.Parameter(w13_mx_scales, requires_grad=False)
 
         layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
+        layer.w2_weight.is_shuffled = is_shuffled
         layer.w2_weight_scale = torch.nn.Parameter(w2_mx_scales, requires_grad=False)
 
     def create_moe_runner(
@@ -841,6 +845,10 @@ def apply(
         else:
             w13_weight = layer.w13_weight
             w2_weight = layer.w2_weight
+        
+        if hasattr(layer.w13_weight, "is_shuffled"):
+            w13_weight.is_shuffled = True
+            w2_weight.is_shuffled = True
 
         output = fused_moe(
             x,
diff --git a/python/sglang/srt/layers/quantization/quark/quark_moe.py b/python/sglang/srt/layers/quantization/quark/quark_moe.py
index 497e69b8e679..6bb7ba9374bc 100644
--- a/python/sglang/srt/layers/quantization/quark/quark_moe.py
+++ b/python/sglang/srt/layers/quantization/quark/quark_moe.py
@@ -13,7 +13,7 @@
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
 from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
 from sglang.srt.layers.quantization.utils import all_close_1d, per_tensor_dequantize
-from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
+from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs, is_gfx95_supported
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (
@@ -24,8 +24,7 @@
 
 logger = logging.getLogger(__name__)
 
-_is_hip = is_hip()
-_is_shuffle_moe_mxfp4 = get_bool_env_var("AITER_MXFP4_MOE_SF") and _is_hip
+_is_shuffle_moe_mxfp4 = is_gfx95_supported()
 
 __all__ = ["QuarkMoEMethod", "QuarkW4A4MXFp4MoEMethod"]
 
@@ -190,6 +189,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             layer.w2_weight.data = shuffle_weight(
                 layer.w2_weight.contiguous(), (16, 16)
             )
+            layer.w13_weight.is_shuffled = True
+            layer.w2_weight.is_shuffled = True
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
@@ -219,6 +220,10 @@ def apply(
         else:
             w13_weight = layer.w13_weight
             w2_weight = layer.w2_weight
+        
+        if hasattr(layer.w13_weight, "is_shuffled"):
+            w13_weight.is_shuffled = True
+            w2_weight.is_shuffled = True
 
         output = fused_moe(
             x,

From 1ec07e7c6f67f47e044fb766c44335b62fdbd941 Mon Sep 17 00:00:00 2001
From: thomaswang <thomawan@amd.com>
Date: Tue, 18 Nov 2025 22:05:10 -0600
Subject: [PATCH 2/4] Remove AITER_MXFP4_MOE_SF in rocm dockerfile

---
 docker/rocm.Dockerfile | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile
index 1cc106a2ec09..b50a9aca7a15 100644
--- a/docker/rocm.Dockerfile
+++ b/docker/rocm.Dockerfile
@@ -22,7 +22,6 @@ ENV BUILD_AITER_ALL="1"
 ENV BUILD_MOONCAKE="1"
 ENV AITER_COMMIT="v0.1.4"
 ENV NO_DEPS_FLAG=""
-ENV AITER_MXFP4_MOE_SF="0"
 
 # ===============================
 # Base image 942 and args
@@ -34,7 +33,6 @@ ENV BUILD_AITER_ALL="1"
 ENV BUILD_MOONCAKE="1"
 ENV AITER_COMMIT="v0.1.7.post1"
 ENV NO_DEPS_FLAG=""
-ENV AITER_MXFP4_MOE_SF="0"
 
 # ===============================
 # Base image 950 and args
@@ -44,9 +42,8 @@ ENV BUILD_TRITON="0"
 ENV BUILD_LLVM="0"
 ENV BUILD_AITER_ALL="1"
 ENV BUILD_MOONCAKE="1"
-ENV AITER_COMMIT="v0.1.7.post1"
+ENV AITER_COMMIT="169acefff8f96832625f6ea33eeb4c875b423594"
 ENV NO_DEPS_FLAG=""
-ENV AITER_MXFP4_MOE_SF="1"
 # ===============================
 # Chosen arch and args
 FROM ${GPU_ARCH}
@@ -107,8 +104,7 @@ RUN git clone ${AITER_REPO} \
  && git checkout ${AITER_COMMIT} \
  && git submodule update --init --recursive
 RUN cd aiter \
-     && if [ "$GPU_ARCH" = "gfx950" ]; then export AITER_MXFP4_MOE_SF=1; fi \
-     && echo "[AITER] GPU_ARCH=${GPU_ARCH} AITER_MXFP4_MOE_SF=${AITER_MXFP4_MOE_SF:-unset}" \
+     && echo "[AITER] GPU_ARCH=${GPU_ARCH}" \
      && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \
           sh -c "HIP_CLANG_PATH=/sgl-workspace/llvm-project/build/bin/ PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \
         elif [ "$BUILD_AITER_ALL" = "1" ]; then \
@@ -299,7 +295,6 @@ RUN python3 -m pip install --no-cache-dir \
 
 # -----------------------
 # Performance environment variable.
-RUN echo "AITER_MXFP4_MOE_SF=${AITER_MXFP4_MOE_SF}" >> /etc/environment
 
 ENV HIP_FORCE_DEV_KERNARG=1
 ENV HSA_NO_SCRATCH_RECLAIM=1

From 4c68966bdb547163687f450b7d911a4c2615944c Mon Sep 17 00:00:00 2001
From: thomaswang <thomawan@amd.com>
Date: Tue, 18 Nov 2025 22:45:11 -0600
Subject: [PATCH 3/4] Modify with the pre-commit fix

---
 python/sglang/srt/layers/quantization/mxfp4.py           | 5 ++---
 python/sglang/srt/layers/quantization/quark/quark_moe.py | 9 +++++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py
index dc36d49245a0..d44444a3a8c9 100644
--- a/python/sglang/srt/layers/quantization/mxfp4.py
+++ b/python/sglang/srt/layers/quantization/mxfp4.py
@@ -39,11 +39,10 @@
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
     direct_register_custom_op,
-    get_bool_env_var,
     is_cuda,
     is_flashinfer_available,
-    is_hip,
     is_gfx95_supported,
+    is_hip,
     is_sm100_supported,
     is_triton_kernels_available,
     log_info_on_rank0,
@@ -845,7 +844,7 @@ def apply(
         else:
             w13_weight = layer.w13_weight
             w2_weight = layer.w2_weight
-        
+
         if hasattr(layer.w13_weight, "is_shuffled"):
             w13_weight.is_shuffled = True
             w2_weight.is_shuffled = True
diff --git a/python/sglang/srt/layers/quantization/quark/quark_moe.py b/python/sglang/srt/layers/quantization/quark/quark_moe.py
index 6bb7ba9374bc..e4839220103f 100644
--- a/python/sglang/srt/layers/quantization/quark/quark_moe.py
+++ b/python/sglang/srt/layers/quantization/quark/quark_moe.py
@@ -13,7 +13,12 @@
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
 from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
 from sglang.srt.layers.quantization.utils import all_close_1d, per_tensor_dequantize
-from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs, is_gfx95_supported
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_gfx95_supported,
+    is_hip,
+    set_weight_attrs,
+)
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (
@@ -220,7 +225,7 @@ def apply(
         else:
             w13_weight = layer.w13_weight
             w2_weight = layer.w2_weight
-        
+
         if hasattr(layer.w13_weight, "is_shuffled"):
             w13_weight.is_shuffled = True
             w2_weight.is_shuffled = True

From 8aee134d662b48c2d45bfd8aa04c701a9fd65a84 Mon Sep 17 00:00:00 2001
From: thomaswang <thomawan@amd.com>
Date: Wed, 19 Nov 2025 07:48:52 -0600
Subject: [PATCH 4/4] Turn AITER_COMMIT to aiter tag

---
 docker/rocm.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile
index b50a9aca7a15..d591400c6ce1 100644
--- a/docker/rocm.Dockerfile
+++ b/docker/rocm.Dockerfile
@@ -42,7 +42,7 @@ ENV BUILD_TRITON="0"
 ENV BUILD_LLVM="0"
 ENV BUILD_AITER_ALL="1"
 ENV BUILD_MOONCAKE="1"
-ENV AITER_COMMIT="169acefff8f96832625f6ea33eeb4c875b423594"
+ENV AITER_COMMIT="v0.1.7.post2"
 ENV NO_DEPS_FLAG=""
 # ===============================
 # Chosen arch and args