Merged
241 changes: 220 additions & 21 deletions aiter/fused_moe.py

Large diffs are not rendered by default.

31 changes: 26 additions & 5 deletions aiter/jit/optCompilerConfig.json
@@ -273,14 +273,16 @@
"srcs": [
"f'{AITER_CSRC_DIR}/pybind/deepgemm_pybind.cu'",
"f'{AITER_CSRC_DIR}/ck_deepgemm/deepgemm.cu'"

],
"flags_extra_cc": [],
"flags_extra_hip": [],
"md_name": "'module_deepgemm'",
"extra_ldflags": "None",
"extra_include": ["f'{CK_DIR}/example/ck_tile/18_flatmm'", "f'{AITER_CSRC_DIR}/ck_deepgemm/include'"],
"verbose": "False",
"extra_include": [
"f'{CK_DIR}/example/ck_tile/18_flatmm'",
"f'{AITER_CSRC_DIR}/ck_deepgemm/include'"
],
"verbose": "False",
"is_python_module": "True",
"is_standalone": "False",
"hip_clang_path": "os.environ.get('FLATMM_HIP_CLANG_PATH')",
@@ -392,6 +394,24 @@
"hip_clang_path": "os.environ.get('GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH')",
"blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_moe_2stages_codegen/gen_instances.py --working_path {{}}'"
},
"module_moe_cktile2stages": {
"srcs": [
"f'{AITER_CSRC_DIR}/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu'",
"f'{AITER_CSRC_DIR}/pybind/moe_cktile_2stages_pybind.cu'"
],
"flags_extra_cc": [],
"flags_extra_hip": [],
"md_name": "'module_moe_cktile2stages'",
"extra_ldflags": "None",
"extra_include": [
"f'{AITER_CSRC_DIR}/ck_tile_gemm_moe_2stages/include'"
],
"verbose": "False",
"is_python_module": "True",
"is_standalone": "False",
"hip_clang_path": "os.environ.get('FLATMM_HIP_CLANG_PATH')",
"blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_tile_gemm_moe_2stages/gen_instances.py --working_path {{}}'"
},
"module_moe_sorting": {
"srcs": [
"f'{AITER_CSRC_DIR}/py_itfs_ck/moe_sorting_kernels.cu'",
@@ -966,12 +986,13 @@
"module_mla_reduce": {
"srcs": [
"f'{AITER_CSRC_DIR}/pybind/mla_reduce_pybind.cu'",
"f'{AITER_CSRC_DIR}/kernels/mla/reduce.cu'"],
"f'{AITER_CSRC_DIR}/kernels/mla/reduce.cu'"
],
"flags_extra_cc": [],
"flags_extra_hip": [],
"extra_ldflags": "None",
"extra_include": [],
"verbose": "False",
"blob_gen_cmd": "''"
}
}
}
106 changes: 106 additions & 0 deletions aiter/ops/moe_op.py
@@ -313,6 +313,112 @@ def ck_moe_stage2(
) -> None: ...


@compile_ops("module_moe_cktile2stages", fc_name="cktile_moe_gemm1")
def moe_cktile2stages_gemm1_ck(
XQ: Tensor,
WQ: Tensor,
Y: Tensor,
sorted_ids: Tensor,
sorted_expert_ids: Tensor,
max_token_ids: Tensor,
topk: int,
n_padded_zeros: Optional[int] = 0,
k_padded_zeros: Optional[int] = 0,
topk_weight: Optional[Tensor] = None,
x_scale: Optional[Tensor] = None,
w_scale: Optional[Tensor] = None,
exp_bias: Optional[Tensor] = None,
block_m: Optional[int] = 32,
) -> Tensor: ...


def moe_cktile2stages_gemm1(
XQ: Tensor,
WQ: Tensor,
Y: Tensor,
sorted_ids: Tensor,
sorted_expert_ids: Tensor,
max_token_ids: Tensor,
topk: int,
n_padded_zeros: Optional[int] = 0,
k_padded_zeros: Optional[int] = 0,
topk_weight: Optional[Tensor] = None,
x_scale: Optional[Tensor] = None,
w_scale: Optional[Tensor] = None,
exp_bias: Optional[Tensor] = None,
block_m: Optional[int] = 32,
):
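    """Thin wrapper forwarding to the JIT-compiled CK-Tile stage-1 MoE GEMM (cktile_moe_gemm1)."""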
return moe_cktile2stages_gemm1_ck(
XQ,
WQ,
Y,
sorted_ids,
sorted_expert_ids,
max_token_ids,
topk,
n_padded_zeros,
k_padded_zeros,
topk_weight,
x_scale,
w_scale,
exp_bias,
block_m,
)


@compile_ops("module_moe_cktile2stages", fc_name="cktile_moe_gemm2")
def moe_cktile2stages_gemm2_ck(
XQ: Tensor,
WQ: Tensor,
Y: Tensor,
sorted_ids: Tensor,
sorted_expert_ids: Tensor,
max_token_ids: Tensor,
topk: int,
n_padded_zeros: Optional[int] = 0,
k_padded_zeros: Optional[int] = 0,
topk_weight: Optional[Tensor] = None,
x_scale: Optional[Tensor] = None,
w_scale: Optional[Tensor] = None,
exp_bias: Optional[Tensor] = None,
block_m: Optional[int] = 32,
) -> Tensor: ...


def moe_cktile2stages_gemm2(
XQ: Tensor,
WQ: Tensor,
Y: Tensor,
sorted_ids: Tensor,
sorted_expert_ids: Tensor,
max_token_ids: Tensor,
topk: int,
n_padded_zeros: Optional[int] = 0,
k_padded_zeros: Optional[int] = 0,
topk_weight: Optional[Tensor] = None,
x_scale: Optional[Tensor] = None,
w_scale: Optional[Tensor] = None,
exp_bias: Optional[Tensor] = None,
block_m: Optional[int] = 32,
):
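    """Thin wrapper forwarding to the JIT-compiled CK-Tile stage-2 MoE GEMM (cktile_moe_gemm2)."""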
return moe_cktile2stages_gemm2_ck(
XQ,
WQ,
Y,
sorted_ids,
sorted_expert_ids,
max_token_ids,
topk,
n_padded_zeros,
k_padded_zeros,
topk_weight,
x_scale,
w_scale,
exp_bias,
block_m,
)
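

A hedged usage sketch for the stage-1 wrapper. Every shape, dtype, and the contents of the routing tensors below are illustrative assumptions, not values taken from this PR; in real use, sorted_ids / sorted_expert_ids / max_token_ids come from aiter's MoE sorting step, and moe_cktile2stages_gemm2 is invoked the same way on the stage-1 output.

import torch
from aiter.ops.moe_op import moe_cktile2stages_gemm1

tokens, topk, model_dim, inter_dim, n_experts = 128, 2, 512, 1024, 8  # assumed sizes
block_m = 32

XQ = torch.randn(tokens, model_dim, dtype=torch.bfloat16, device="cuda")
WQ = torch.randn(n_experts, inter_dim, model_dim, dtype=torch.bfloat16, device="cuda")
Y = torch.empty(tokens * topk, inter_dim, dtype=torch.bfloat16, device="cuda")

# Placeholder routing metadata; real values come from the MoE sorting kernels (not shown).
sorted_ids = torch.zeros(tokens * topk, dtype=torch.int32, device="cuda")
sorted_expert_ids = torch.zeros(
    (tokens * topk + block_m - 1) // block_m, dtype=torch.int32, device="cuda"
)
max_token_ids = torch.tensor([tokens * topk], dtype=torch.int32, device="cuda")

moe_cktile2stages_gemm1(
    XQ, WQ, Y, sorted_ids, sorted_expert_ids, max_token_ids, topk, block_m=block_m
)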


dtype2str_dict = {
dtypes.fp16: "f16",
dtypes.bf16: "b16",
86 changes: 86 additions & 0 deletions aiter/ops/shuffle.py
@@ -23,3 +23,89 @@ def shuffle_weight(x: torch.Tensor, layout=(16, 16), use_int4=False) -> torch.Tensor:
x_ = x_.contiguous()
x_ = x_.view(*x.shape)
return x_.view(x_type)


def shuffle_weight_NK(
    # Review comment (Collaborator): no use?
x: torch.Tensor, inst_N: int, inst_K: int, use_int4=False
) -> torch.Tensor:
kPerLane = inst_K // (64 // inst_N)
if use_int4:
kPerLane *= 2
    assert (
        x.shape[-2] % inst_N == 0
    ), f"{x.shape[-2]} % {inst_N} == {x.shape[-2] % inst_N}"
    assert (
        x.shape[-1] % inst_K == 0
    ), f"{x.shape[-1]} % {inst_K} == {x.shape[-1] % inst_K}"

x_ = x
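    # View as [B, N // inst_N, inst_N, K // inst_K, 64 // inst_N, kPerLane], then permute
    # to [B, N // inst_N, K // inst_K, 64 // inst_N, inst_N, kPerLane] so each lane's
    # kPerLane elements become contiguous (assumes a 64-lane wavefront).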
x_ = x_.view(
-1, x.shape[-2] // inst_N, inst_N, x.shape[-1] // inst_K, 64 // inst_N, kPerLane
)
x_ = x_.permute(0, 1, 3, 4, 2, 5).contiguous()
return x_.view(*x.shape)
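

A quick usage sketch for shuffle_weight_NK; the tile sizes and dtype below are illustrative assumptions (16x16 MFMA tiles on a 64-lane wavefront), not values taken from this PR.

import torch
from aiter.ops.shuffle import shuffle_weight_NK

w = torch.randn(256, 256, dtype=torch.bfloat16)      # assumed [N, K] weight
w_shuf = shuffle_weight_NK(w, inst_N=16, inst_K=16)  # kPerLane = 16 // (64 // 16) = 4
assert w_shuf.shape == w.shape  # lane layout changes, logical shape does not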


def shuffle_weight_a16w4(src: torch.Tensor, NLane: int, gate_up: bool) -> torch.Tensor:
"""
src: shape [experts_cnt, N, K_pk], where K_pk = K // 2
Returns: shuffled tensor of shape [experts_cnt, N0*2, K0, KLane, NLane, KPack]
"""
# print("gemm shape:", src.shape)
src_type = src.dtype
if hasattr(torch, "float4_e2m1fn_x2") and src_type == torch.float4_e2m1fn_x2:
src = src.view(torch.uint8)
experts_cnt, N, K_pk = src.shape
if gate_up:
N = N // 2
KPack = 16
    KLane = 64 // NLane  # 4 when NLane == 16
N0 = N // NLane
K0 = K_pk // (KLane * KPack)
if gate_up:
src_reshaped = src.view(
experts_cnt, 2, N0, NLane, K0, KLane, KPack
) # [E,2, N0, NLane ,K0, KLane, KPack]
src_reshaped = src_reshaped.permute(
0, 2, 1, 4, 5, 3, 6
).contiguous() # [E, N0, 2, K0, KLane, NLane, KPack]
interleaved = src_reshaped.view(*src.shape)
else:
src_reshaped = src.view(experts_cnt, N0, NLane, K0, KLane, KPack)
interleaved = (
src_reshaped.permute(0, 1, 3, 4, 2, 5).contiguous().view(*src.shape)
)
# print("interleaved shape:", interleaved.shape)
return interleaved.contiguous().view(src_type)
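

A hedged usage sketch for shuffle_weight_a16w4; all sizes are illustrative assumptions (a gate/up weight with int4 pairs packed into uint8 bytes), not values from this PR.

import torch
from aiter.ops.shuffle import shuffle_weight_a16w4

E, N, K_pk = 8, 256, 128  # assumed expert count and dims (so K = 256)
w4 = torch.randint(0, 256, (E, N, K_pk), dtype=torch.uint8)
w4_shuf = shuffle_weight_a16w4(w4, NLane=16, gate_up=True)
assert w4_shuf.shape == w4.shape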


def shuffle_scale_a16w4(
src: torch.Tensor, experts_cnt: int, gate_up: bool
) -> torch.Tensor:
n_experts, k_ = src.shape
n_ = n_experts // experts_cnt
# MXFP4 constants
K_Pack = 2
N_Pack = 2
N_Lane = 16
K_Lane = 64 // N_Lane # 4

# Basic dimensions
K1 = k_ // K_Pack // K_Lane # k_ // 8
N1 = n_ // N_Lane // N_Pack # n_ // 32
real_k = 32 * k_ * K_Pack * K_Lane # 1x32 quant
    assert real_k >= 256, f"K {real_k} must be at least Tile_K (256)"
# print("src shape", src.shape)
# Reshape based on moe_kind
if gate_up:
# Reshape to: [E, N_Pack, N1, N_Lane, K1, K_Pack, K_Lane]
shfl_scale = src.view(experts_cnt, N_Pack, N1, N_Lane, K1, K_Pack, K_Lane)
# Permute to: [E, N1, K1, K_Lane, N_Lane, K_Pack, N_Pack]
shfl_scale = shfl_scale.permute(0, 2, 4, 6, 3, 5, 1).contiguous()
else:
        # Reshape to: [E, N1, N_Pack, N_Lane, K1, K_Pack, K_Lane]
shfl_scale = src.view(experts_cnt, N1, N_Pack, N_Lane, K1, K_Pack, K_Lane)
# Permute to: [E, N1, K1, K_Lane, N_Lane, K_Pack, N_Pack]
shfl_scale = shfl_scale.permute(0, 1, 4, 6, 3, 5, 2).contiguous()
# print("shf_scale shape:", shfl_scale.shape)
return shfl_scale.view(*src.shape).contiguous()
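

A matching sketch for the scale shuffle; the sizes are illustrative assumptions chosen to satisfy the Tile_K assert (real_k = 32 * 16 * 2 * 4 = 4096).

import torch
from aiter.ops.shuffle import shuffle_scale_a16w4

experts_cnt, n_, k_ = 8, 64, 16  # assumed: scales flattened to [experts_cnt * n_, k_]
scales = torch.randint(0, 256, (experts_cnt * n_, k_), dtype=torch.uint8)
scales_shuf = shuffle_scale_a16w4(scales, experts_cnt, gate_up=True)
assert scales_shuf.shape == scales.shape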