ROCm · valarLip · Nov 8, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025
diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py
@@ -3,7 +3,6 @@
 
 import functools
 import os
-import sys
 from dataclasses import dataclass
 from typing import Callable, Optional
 
@@ -619,6 +618,7 @@ def FinalFunc():
                 run_1stage = token > 32
             elif q_type != QuantType.per_1x32:
                 run_1stage = token < 256
+
         block_m = (
             BLOCK_SIZE_M
             if run_1stage

diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu
@@ -32,13 +32,20 @@ MoeKernel moe_dispatch(std::string &kernelName, int block_m, int inter_dim, at::
         }
         std::cout << "[aiter] ck kernel not found: " << kernelName << std::endl;
     }
+
+    // Opt-in switch: only AITER_MXFP4_MOE_SF=1 turns this on; unset or any other
+    // value leaves it off. The flag is forwarded to the heuristic dispatch below.
+    const char* moe_env_value = std::getenv("AITER_MXFP4_MOE_SF");
+    const bool use_mxfp4_moe_preshuffle =
+        moe_env_value != nullptr && std::string(moe_env_value) == "1";
+
     if constexpr (stage == 1)
     {
-        return moe_stage1_heuristic_dispatch(block_m, x_dtype, w_dtype, y_dtype, act_op, quant_type, mul_routed_weight);
+        return moe_stage1_heuristic_dispatch(block_m, x_dtype, w_dtype, y_dtype, act_op, quant_type, mul_routed_weight, use_mxfp4_moe_preshuffle);
     }
     else
     {
-        return moe_stage2_heuristic_dispatch(block_m, inter_dim, x_dtype, w_dtype, y_dtype, 0, quant_type, mul_routed_weight);
+        return moe_stage2_heuristic_dispatch(block_m, inter_dim, x_dtype, w_dtype, y_dtype, 0, quant_type, mul_routed_weight, use_mxfp4_moe_preshuffle);
     }
 }
 

diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py
@@ -191,10 +191,17 @@ def name(self) -> str:
 
 # gemm1 out:bf16/fp16 A:mxfp4 B:mxfp4
 a4w4_gemm1_kernels_list= {
+     0: kernelInstanceGEMM1(       256,       32,          128,       128,     1,       4,        3,),
+     1: kernelInstanceGEMM1(       256,       64,          128,       128,     1,       4,        3,),
+     2: kernelInstanceGEMM1(       256,      128,          128,       128,     1,       4,        3,),
+    #  3: kernelInstanceGEMM1(       256,      256,         128,       128,     2,       2,        3,),
+}
+
+# bns gemm1 out:bf16/fp16 A:mxfp4 B:mxfp4
+a4w4_bns_gemm1_kernels_list= {
      0: kernelInstanceGEMM1(       256,       32,         128,       128,     1,       4,        3,),
      1: kernelInstanceGEMM1(       256,       64,          64,       128,     2,       2,        3,),
      2: kernelInstanceGEMM1(       256,      128,          64,       128,     2,       2,        3,),
-    #  3: kernelInstanceGEMM1(       256,      256,         128,       128,     2,       2,        3,),
 }
 
 gemm1_kernels_dict = {
@@ -205,6 +212,7 @@ def name(self) -> str:
     "a8w8blkscale": a8w8_gemm1_blockscale_kernels_list,
     "a8w4": a8w4_gemm1_kernels_list,
     "a4w4": a4w4_gemm1_kernels_list,
+    "a4w4_bns": a4w4_bns_gemm1_kernels_list,
 }
 
 
@@ -276,13 +284,22 @@ def name(self) -> str:
 }
 # gemm2 out:bf16/fp16 A:fp8 B:in4
 a4w4_gemm2_kernels_list= {
+     0: kernelInstanceGEMM2(       256,        32,        128,       128,     1,       4,         3,),
+     1: kernelInstanceGEMM2(       256,        64,        128,       128,     1,       4,         3,),
+     2: kernelInstanceGEMM2(       256,       128,        128,       128,     1,       4,         3,),
+     4: kernelInstanceGEMM2(        64,        32,         32,       128,     1,       1,         1,),
+     5: kernelInstanceGEMM2(        64,        64,         128,       128,     1,       1,         3,),
+     6: kernelInstanceGEMM2(        64,       128,        128,       128,     1,       1,         3,),
+    #  7: kernelInstanceGEMM2(      256,       256,         64,       128,     2,       2,         3,),
+}
+# bns gemm2 out:bf16/fp16 A:mxfp4 B:mxfp4
+a4w4_bns_gemm2_kernels_list= {
      0: kernelInstanceGEMM2(       64,        32,         32,       128,     1,       1,         1,),
      1: kernelInstanceGEMM2(       64,        64,         64,       128,     1,       1,         1,),
      2: kernelInstanceGEMM2(       64,       128,        128,       128,     1,       1,         1,),
      4: kernelInstanceGEMM2(      256,        32,        128,       128,     1,       4,         3,),
      5: kernelInstanceGEMM2(      256,        64,         64,       128,     2,       2,         3,),
      6: kernelInstanceGEMM2(      256,       128,         64,       128,     2,       2,         3,),
-    #  7: kernelInstanceGEMM2(      256,       256,         64,       128,     2,       2,         3,),
 }
 
 # fmt: on
@@ -294,6 +311,7 @@ def name(self) -> str:
     "a8w8blkscale": a8w8_gemm2_blockscale_kernels_list,
     "a8w4": a8w4_gemm2_kernels_list,
     "a4w4": a4w4_gemm2_kernels_list,
+    "a4w4_bns": a4w4_bns_gemm2_kernels_list,
 }
 
 
@@ -312,6 +330,7 @@ def get_gemm1_kernels_list(
     ActOP: str,
     MulRoutedWeight: bool,
 ) -> list:
+    global bns_or_preslf
     arch = get_gfx()
     if Adtype in bit16_list and Bdtype in bit16_list and Adtype == Adtype:
         if arch == "gfx950":
@@ -337,7 +356,10 @@ def get_gemm1_kernels_list(
     ):
         tag = "a8w4"
     elif Adtype in bit4_list and Bdtype in bit4_list:
-        tag = "a4w4"
+        # Match the C++ dispatcher: only the exact value "1" selects "a4w4".
+        # A string comparison also avoids int() raising ValueError on "" or "true".
+        sf_enabled = os.getenv("AITER_MXFP4_MOE_SF", "0") == "1"
+        tag = "a4w4" if sf_enabled else "a4w4_bns"
     else:
         raise ValueError(f"Unsupported data type combination: {Adtype}, {Bdtype}")
     kernels_list = gemm1_kernels_dict[tag]
@@ -372,6 +394,7 @@ def get_gemm2_kernels_list(
     QuantType: str,
     MulRoutedWeight: bool,
 ) -> list:
+    global bns_or_preslf
     arch = get_gfx()
 
     if Adtype in bit16_list and Bdtype in bit16_list and Adtype == Adtype:
@@ -398,7 +421,10 @@ def get_gemm2_kernels_list(
     ):
         tag = "a8w4"
     elif Adtype in bit4_list and Bdtype in bit4_list:
-        tag = "a4w4"
+        # Match the C++ dispatcher: only the exact value "1" selects "a4w4".
+        # A string comparison also avoids int() raising ValueError on "" or "true".
+        sf_enabled = os.getenv("AITER_MXFP4_MOE_SF", "0") == "1"
+        tag = "a4w4" if sf_enabled else "a4w4_bns"
     else:
         raise ValueError(f"Unsupported data type combination: {Adtype}, {Bdtype}")
     kernels_list = gemm2_kernels_dict[tag]

diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
 // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
+#include "ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp"
+// #include "ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp"
 #include "gemm_moe_ck2stages.h"
 #include <iostream>
 
@@ -89,7 +90,7 @@ void ck_moe_stage1_gemm(const hipStream_t& stream,
     static constexpr ck::index_t D1Vec               = PerTensorQuant ? 1 : EVec;
     static constexpr ck::index_t D2Vec               = 1;
 
-    using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmMXBNS
+    using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmMXBPreShuffle
         // clang-format off
 ///######|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     DsData|     EData|     AccData|         CShuffle|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 ///######|         |         |         |        |       Type|       Type|       Type|      Type|        Type|         DataType| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
@@ -104,8 +105,8 @@ void ck_moe_stage1_gemm(const hipStream_t& stream,
                 AK1,   BK1,
                 MNPerXDL,   MNPerXDL,
                 MXDLPerWave,     NXDLPerWave,
-                S<K0_A, K0_M_A, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
-                S<K0_B, K0_N_B, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0,
+                S<K0_A, K0_M_A, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 1,
+                S<K0_B, K0_N_B, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 1,
                 2,    CShuffleNXDLPerWave,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
                 ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, ActOP, Nswizzle, true, MulRoutedWeight, ck::index_t, A0DataType>; // clang-format on
     // clang-format on
@@ -278,7 +279,7 @@ void ck_moe_stage2_gemm(const hipStream_t& stream,
     static constexpr ck::index_t K0_M          = BLOCKSIZE / K0_A;
     static constexpr ck::index_t K0_N          = BLOCKSIZE / K0_B;
 
-    using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmMXBNS
+    using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmMXBPreShuffle
         // clang-format off
 ///#####|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     DsData|     EData|     AccData|         CShuffle|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 ///#####|         |         |         |        |       Type|       Type|       Type|      Type|        Type|         DataType| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
@@ -293,8 +294,8 @@ void ck_moe_stage2_gemm(const hipStream_t& stream,
                 AK1,   BK1,
                 MNPerXDL,   MNPerXDL,
                 MXDLPerWave,     NXDLPerWave,
-                S<K0_A, K0_M, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
-                S<K0_B, K0_N, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0,
+                S<K0_A, K0_M, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 1,
+                S<K0_B, K0_N, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 1,
                 2,    CShuffleNXDLPerWave,   S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec>,
                 ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, 0, Nswizzle, false, MulRoutedWeight, ck::index_t, A0DataType>;