14 changes: 7 additions & 7 deletions aiter/configs/tuned_fmoe.csv
@@ -325,13 +325,13 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,
80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,299.2146,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,268.1991,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,567.4137000000001,0,39.74,2493.4
80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,316.0831,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,287.2443,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,603.3274,0,74.75,2354.11
80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59
80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,118.9203,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,134.5117,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,253.432,0,5.56,5562.16
80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,190.3221,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,205.7596,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,396.0817,0,7.12,3559.81
80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,248.4832,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,268.7225,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,517.2057,0,10.9,2727.47
80,128,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,290.4406,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,311.0546,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,601.4952000000001,0,18.74,2347.55
80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,301.7002,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,327.6233,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,629.3235,0,35.83,2248.11
80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,309.3462,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,357.9321,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,667.2783,0,67.58,2128.49
80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,396.699,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,530.6901,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,927.3891,0,97.26,1543.37
80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,239.0773,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,239.0773,1,5.89,5896.13
80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,361.5408,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,361.5408,1,7.8,3899.9
80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,409.1668,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,409.1668,1,13.78,3447.65
80,128,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,513.5364,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,513.5364,1,21.95,2749.64
80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,534.8764,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,534.8764,1,42.16,2645.08
80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,560.9956,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,560.9956,1,80.39,2531.74
80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,723.0851,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,723.0851,1,124.74,1979.44
80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,438.8756,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,256.4253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,695.3009,0,4.05,8108.15
80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,714.79,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,400.2169,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,1115.0069,0,5.06,5056.53
80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,998.4164,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,561.412,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,1559.8284,0,7.23,3615.13
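For orientation, here is a minimal sketch of how the block of per_Token rows retuned above could be inspected programmatically. It is not part of this PR; the relative CSV path, the use of pandas, and the displayed columns are assumptions based on the header visible in this hunk.

```python
# Minimal sketch: pull the retuned rows for the 7168/256/256, top-k 8 shape
# out of the tuned table. Assumes the script runs from the repository root
# and that pandas is installed; neither is implied by the PR itself.
import pandas as pd

df = pd.read_csv("aiter/configs/tuned_fmoe.csv")

shape = (
    (df["model_dim"] == 7168)
    & (df["inter_dim"] == 256)
    & (df["expert"] == 256)
    & (df["topk"] == 8)
)
print(df.loc[shape, ["token", "model_dim", "inter_dim", "expert", "topk"]]
        .to_string(index=False))
```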
21 changes: 14 additions & 7 deletions aiter/configs/untuned_fmoe.csv
@@ -1,7 +1,14 @@
token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1
512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0
512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Tensor,1,0
4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1
56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0
16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
128,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0
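The seven entries added at the end of this hunk differ only in the token count. Below is a hedged sketch of generating them from the header row shown above, not a claim about how the config file was actually produced; the append-to-file path is illustrative only.

```python
# Hedged sketch: emit the seven new per_Token entries following the column
# order of the untuned_fmoe.csv header, instead of editing the CSV by hand.
import csv

header = (
    "token,model_dim,inter_dim,expert,topk,act_type,dtype,"
    "q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1"
).split(",")

rows = [
    [token, 7168, 256, 256, 8,
     "ActivationType.Silu", "torch.bfloat16",
     "torch.float8_e4m3fnuz", "torch.float8_e4m3fnuz",
     "QuantType.per_Token", 1, 0]
    for token in (16, 32, 64, 128, 256, 512, 1024)
]
assert all(len(r) == len(header) for r in rows)  # keep rows aligned with the schema

with open("aiter/configs/untuned_fmoe.csv", "a", newline="") as f:
    csv.writer(f).writerows(rows)
```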
2 changes: 2 additions & 0 deletions hsa/gfx942/fmoe_2stages/tune.py
@@ -1539,6 +1539,7 @@ def gen_2stages_task(self, key, blockMs):
int(q_type),
str(act_type).split(".")[-1].lower(),
doweight_stage1,
False, # bpreshuffle
)
_, ck_stage2_kernels = get_gemm2_kernels_list(
dtype2str_dict[q_dtype_a],
@@ -1547,6 +1548,7 @@
False,
int(q_type),
not doweight_stage1,
False, # bpreshuffle
)
for blockM in blockMs:
if blockM in [16, 32, 64, 128] and use_g1u1: