From efa1c7837e8cc72179f619e9f7581a6aa975a97d Mon Sep 17 00:00:00 2001 From: lalala-sh Date: Mon, 13 Apr 2026 09:31:02 +0000 Subject: [PATCH] fix split module jit and retune --- .../a8w8_blockscale_tuned_fmoe_ds_v3.csv | 32 ++++----- .../a8w8_blockscale_tuned_fmoe_glm5.csv | 34 +++++----- ...8w8_blockscale_tuned_fmoe_minimax-m2_5.csv | 66 +++++++++---------- .../a8w8_blockscale_tuned_fmoe_qwen3_235b.csv | 64 +++++++++--------- .../gemm_moe_ck2stages_common.py | 5 +- .../gemm_moe_tune.py | 8 +-- 6 files changed, 102 insertions(+), 107 deletions(-) diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv index faf7c11cff..140b891b9f 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv @@ -1,17 +1,17 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag -256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,21.0807,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.0%,9.8239,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.7%,30.9046,0,3.21,45780.0, -256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,26.5397,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,15.1899,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.3%,41.7296,0,4.75,33904.81, -256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,40.1046,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.0%,20.1531,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.2%,60.2577,0,6.58,23480.44, -256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,4,64.5498,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.8%,29.69,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.6%,94.2398,0,8.41,15014.5,Expand commentComment on line R5 -256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,157.0122,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,157.0122,1,10.1,9012.9, -256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,4,181.1403,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,83.3332,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.6%,264.4735,0,11.99,5352.06, -256,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,2,185.6359,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.7%,86.0725,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.4%,271.7084,0,23.34,5212.09, -256,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,2,193.3333,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.7%,87.6417,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,280.975,0,45.14,5045.09, -256,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,195.8151,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,90.705,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,286.5201,0,88.54,4957.06, -256,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,203.8627,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,100.2588,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,304.1215,0,166.82,4688.26, -256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,418.9197,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,418.9197,1,242.21,3429.8, -256,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,574.8491,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,574.8491,1,353.03,2537.76, -256,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,799.6824,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,799.6824,1,507.54,1879.34, -256,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1298.6464,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1298.6464,1,625.07,1225.08, -256,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2333.2857,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,2333.2857,1,695.8,757.35, -256,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,4225.6692,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,4225.6692,1,768.4,501.56, \ No newline at end of file +256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,8,20.5773,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,11.8129,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.1%,32.3902,0,3.06,43680.27, +256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,27.4554,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,15.5186,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.9%,42.974,0,4.61,32923.03, +256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,2,39.5086,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.5%,21.3461,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,60.8547,0,6.51,23250.09, +256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,77.4907,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,77.4907,1,10.23,18259.78, +256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,113.4418,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,113.4418,1,13.98,12474.55, +256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,173.2023,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,173.2023,1,18.31,8172.4, +256,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,223.1917,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,223.1917,1,28.41,6345.07, +256,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,163.6759,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,86.9276,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.9%,250.6035,0,50.61,5656.52, +256,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,195.9816,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,90.1231,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.8%,286.1047,0,88.66,4964.25, +256,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,207.1839,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,98.0931,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,305.277,0,166.19,4670.52, +256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,385.394,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,,0.0%,385.394,1,263.29,3728.16, +256,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,450.5774,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,450.5774,1,450.39,3237.69, +256,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,670.9652,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,670.9652,1,604.91,2239.87, +256,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1266.7307,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1266.7307,1,640.82,1255.95, +256,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2251.03,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,2251.03,1,721.22,785.02, +256,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,4483.8995,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,4483.8995,1,724.15,472.68, diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_glm5.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_glm5.csv index 18aafbf2d0..571d6f7a00 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_glm5.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_glm5.csv @@ -1,17 +1,17 @@ -cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw -256,1,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,6,19.2511,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,8.356,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.7%,27.6071,0,3.08,43926.98 -256,2,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,4,24.1099,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.6%,15.3292,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.4%,39.4391,0,4.31,30749.05 -256,4,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,3,36.739,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,18.4441,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,55.1831,0,6.16,21976.87 -256,8,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,79.6053,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,79.6053,1,8.54,15235.49 -256,16,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,125.4067,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,,0.0%,125.4067,1,10.84,9672.31 -256,32,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,3,156.0002,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.8%,71.8745,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,227.8747,0,11.93,5324.28 -256,64,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,3,159.8955,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.8%,73.713,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,233.6085,0,23.27,5196.12 -256,128,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,165.9162,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,75.0628,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,240.979,0,45.11,5042.09 -256,256,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,166.6076,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,77.4096,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,244.0172,0,89.11,4988.98 -256,512,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,174.0808,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,83.8513,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.4%,257.9321,0,168.6,4738.13 -256,1024,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,357.602,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,357.602,1,243.21,3443.92 -256,2048,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,500.8267,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,500.8267,1,347.32,2496.73 -256,4096,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,675.1211,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,675.1211,1,515.3,1908.07 -256,8192,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1122.5513,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1122.5513,1,619.82,1214.8 -256,16384,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1995.3897,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1995.3897,1,697.39,759.08 -256,32768,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,3628.9889,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,3628.9889,1,766.92,500.6 +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag +256,1,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,8,21.3754,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.8%,8.8813,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,18.2%,30.2567,0,2.81,40080.27, +256,2,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,6,25.9015,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,15.9492,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.8%,41.8507,0,4.06,28977.17, +256,4,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,3,37.1216,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.7%,19.0613,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.3%,56.1829,0,6.05,21585.78, +256,8,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,68.9083,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,68.9083,1,9.86,17600.57, +256,16,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,92.8796,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,92.8796,1,14.63,13059.63, +256,32,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,139.7967,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,139.7967,1,19.44,8678.8, +256,64,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,188.0575,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,,0.0%,188.0575,1,28.91,6454.72, +256,128,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,213.5816,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,,0.0%,213.5816,1,50.9,5688.87, +256,256,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,165.9312,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,78.5745,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.8%,244.5057,0,88.93,4979.01, +256,512,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,176.0215,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,84.8375,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,260.859,0,166.71,4684.97, +256,1024,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,333.3869,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,,0.0%,333.3869,1,260.88,3694.06, +256,2048,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,386.3245,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,386.3245,1,450.26,3236.73, +256,4096,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,581.2317,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,581.2317,1,598.54,2216.29, +256,8192,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1089.5555,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1089.5555,1,638.6,1251.59, +256,16384,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1941.8567,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1941.8567,1,716.62,780.01, +256,32768,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,3768.5665,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,3768.5665,1,738.51,482.06, diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_minimax-m2_5.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_minimax-m2_5.csv index 79fd1a3279..d781b6b689 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_minimax-m2_5.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_minimax-m2_5.csv @@ -1,33 +1,33 @@ -cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw -256,1,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,4,19.8736,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.8%,8.5938,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,28.4674,0,3.98,63649.95 -256,2,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,24.6209,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,14.2644,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.3%,38.8853,0,5.82,46597.5 -256,4,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,37.6495,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,18.6115,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.4%,56.261,0,8.05,32206.61 -256,8,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,66.8547,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,34.1756,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.0%,101.0303,0,8.97,17935.34 -256,16,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,114.0522,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,58.9743,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,173.0265,0,10.47,10472.89 -256,32,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,213.9812,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,105.9787,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,319.9599,0,11.33,5663.94 -256,64,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,211.9831,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,107.5922,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,319.5753,0,22.68,5671.68 -256,128,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,212.9532,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,110.8636,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,323.8168,0,44.76,5599.21 -256,256,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,219.7848,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,113.6637,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,333.4485,0,86.94,5441.02 -256,512,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,225.8961,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,119.4503,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,345.3464,0,167.9,5260.39 -256,1024,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,222.2637,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,130.3546,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,352.6183,0,328.87,5165.29 -256,2048,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,303.1189,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,177.779,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,480.8979,0,482.28,3807.07 -256,4096,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,497.602,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,300.5426,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,798.1446,0,581.17,2317.48 -256,8192,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,881.8032,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,608.3583,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,1490.1615,0,622.56,1266.6 -256,16384,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,1510.9624,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,1180.8653,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,2691.8277,0,689.28,729.22 -256,32768,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,2877.4131,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,2360.7656,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,5238.1787,0,708.42,403.56 -256,1,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,8,15.8809,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.2%,7.998,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,23.8789,0,2.37,37940.56 -256,2,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,3,18.9289,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,9.3371,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.5%,28.266,0,4.01,32052.22 -256,4,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,2,27.2864,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.8%,12.5222,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,39.8086,0,5.69,22759.07 -256,8,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,48.3709,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,48.3709,1,9.36,18731.17 -256,16,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,89.7646,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,89.7646,1,10.09,10094.37 -256,32,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,162.3447,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,162.3447,1,11.16,5582.35 -256,64,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,163.6424,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,163.6424,1,22.15,5539.88 -256,128,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,164.0217,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,164.0217,1,44.19,5530.67 -256,256,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,112.4586,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,62.0607,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.5%,174.5193,0,83.06,5204.75 -256,512,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,169.9644,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,169.9644,1,170.57,5358.11 -256,1024,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,177.6108,_ZN5aiter45fmoe_bf16_blockscaleFp8_g1u1_novs_silu_32x384E,0.0%,0.0,,0.0%,177.6108,1,326.46,5154.0 -256,2048,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,293.2635,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,293.2635,1,395.43,3153.63 -256,4096,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,257.3805,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,237.7241,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,495.1046,0,468.44,1906.1 -256,8192,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,477.7597,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,461.0584,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,938.8181,0,494.09,1045.43 -256,16384,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,1536.1909,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,1536.1909,1,603.9,688.04 -256,32768,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,1489.228,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,1606.6029,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,3095.8309,0,599.33,390.19 +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag +256,1,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,32.3244,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,32.3244,1,3.5,56055.13, +256,2,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,34.174,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,34.174,1,6.63,53021.53, +256,4,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,43.1377,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,43.1377,1,10.5,42004.47, +256,8,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,76.084,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,76.084,1,11.91,23815.95, +256,16,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,128.9085,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,128.9085,1,14.06,14057.16, +256,32,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,199.0606,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,,0.0%,199.0606,1,18.2,9103.93, +256,64,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,270.3214,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,270.3214,1,26.81,6705.09, +256,128,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,304.3742,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,304.3742,1,47.62,5956.87, +256,256,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,319.7141,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,319.7141,1,90.68,5674.75, +256,512,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,328.2534,_ZN5aiter45fmoe_bf16_blockscaleFp8_g1u1_novs_silu_32x384E,0.0%,0.0,,0.0%,328.2534,1,176.64,5534.32, +256,1024,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,388.7551,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,3.0%,0.0,,0.0%,388.7551,1,298.3,4685.15, +256,2048,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,490.4502,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,3.8%,0.0,,0.0%,490.4502,1,472.89,3732.92, +256,4096,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,726.1677,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,3.1%,0.0,,0.0%,726.1677,1,638.77,2547.19, +256,8192,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1266.8399,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,3.3%,0.0,,0.0%,1266.8399,1,732.3,1489.88, +256,16384,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2448.0967,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,3.4%,0.0,,0.0%,2448.0967,1,757.91,801.82, +256,32768,3072,768,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,4726.883,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,3.3%,0.0,,0.0%,4726.883,1,785.05,447.21, +256,1,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,6,18.3474,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.7%,8.0831,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,26.4305,0,2.14,34277.78, +256,2,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,31.9843,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,31.9843,1,3.54,28326.03, +256,4,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,34.8166,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,34.8166,1,6.51,26022.26, +256,8,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,42.4861,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,42.4861,1,10.66,21325.64, +256,16,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,81.3054,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,81.3054,1,11.14,11144.61, +256,32,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,102.7335,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,102.7335,1,17.64,8821.51, +256,64,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,135.6244,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,135.6244,1,26.72,6684.34, +256,128,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,158.1712,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,158.1712,1,45.82,5735.24, +256,256,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,112.7897,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,63.1919,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,175.9816,0,82.37,5161.5, +256,512,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,168.8228,_ZN5aiter45fmoe_bf16_blockscaleFp8_g1u1_novs_silu_32x384E,0.0%,0.0,,0.0%,168.8228,1,171.72,5394.34, +256,1024,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,148.331,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,84.4899,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,232.8209,0,249.04,3931.81, +256,2048,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,346.6301,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,346.6301,1,334.55,2668.1, +256,4096,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,312.9021,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,255.6016,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,568.5037,0,407.96,1660.0, +256,8192,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,553.1389,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,463.1474,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,1016.2863,0,456.42,965.74, +256,16384,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,1550.7237,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,1550.7237,1,598.25,681.59, +256,32768,3072,384,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,2759.5336,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,2759.5336,1,672.37,437.74, diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_qwen3_235b.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_qwen3_235b.csv index 7b6678b8df..fbd41983b7 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_qwen3_235b.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_qwen3_235b.csv @@ -1,33 +1,33 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag -256,1,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,50.5701,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,50.5701,1,5.97,5971.95, -256,2,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,62.2735,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,62.2735,1,9.7,4849.81, -256,4,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,63.3083,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,63.3083,1,19.08,4770.92, -256,8,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,71.1117,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,71.1117,1,33.97,4248.08, -256,16,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,47.3525,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,25.3391,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,72.6916,0,66.47,4157.1, -256,32,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,48.9565,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,26.1604,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,75.1169,0,128.65,4025.5, -256,64,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,50.3719,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,27.6135,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,77.9854,0,247.83,3882.47, -256,128,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,71.4342,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,38.2975,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,109.7317,0,352.27,2766.41, -256,256,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,87.2548,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,60.4363,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.8%,147.6911,0,523.45,2066.04, -256,512,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,140.538,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,99.8574,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,240.3954,0,643.19,1282.39, -256,1024,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,243.9233,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,182.5911,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,426.5144,0,725.03,737.54, -256,2048,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,448.7192,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,340.3445,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,789.0637,0,783.81,414.61, -256,4096,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,857.1214,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,660.3874,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,1517.5088,0,815.12,232.17, -256,8192,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,1680.3975,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,1300.7456,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,2981.1431,0,829.85,135.07, -256,16384,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,3320.1753,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,2589.1816,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,5909.3569,0,837.28,85.17, -256,32768,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,6631.5753,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,5181.274,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,11812.8493,0,837.7,59.65, -256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,8,17.7723,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.3%,8.5972,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,18.6%,26.3695,0,2.86,22904.95, -256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,8,24.7086,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.2%,10.0603,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.9%,34.7689,0,4.34,17371.97, -256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,49.2139,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,49.2139,1,6.14,12273.54, -256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,59.6442,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,59.6442,1,10.13,10128.03, -256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,114.8168,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,114.8168,1,10.52,5262.09, -256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,116.9941,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,116.9941,1,20.65,5165.84, -256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,122.7549,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,122.7549,1,39.36,4926.62, -256,128,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,125.6835,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_32x128E,0.0%,0.0,,0.0%,125.6835,1,76.89,4818.08, -256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,85.5637,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,48.1136,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,133.6773,0,144.58,4541.72, -256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,88.1634,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,54.4834,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,142.6468,0,270.98,4278.2, -256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,126.4899,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,85.014,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,211.5039,0,365.52,2915.14, -256,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,191.2118,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,166.2391,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,357.4509,0,432.56,1760.09, -256,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,315.8578,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,311.874,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,627.7318,0,492.63,1042.34, -256,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,541.1465,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,555.3761,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,1096.5226,0,564.03,642.62, -256,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,936.6648,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,1089.5139,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,2026.1787,0,610.48,397.45, -256,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,1781.4846,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,2148.8869,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.6%,3930.3715,0,629.43,256.12, \ No newline at end of file +256,1,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,44.4733,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,44.4733,1,6.79,6790.64, +256,2,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,49.078,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,49.078,1,12.31,6153.76, +256,4,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,56.6031,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,56.6031,1,21.34,5336.09, +256,8,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,65.7221,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,65.7221,1,36.76,4596.45, +256,16,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,47.4816,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,25.0412,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.1%,72.5228,0,66.63,4166.78, +256,32,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,48.5349,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,25.8063,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.4%,74.3412,0,129.99,4067.5, +256,64,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,61.2206,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,29.5277,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.4%,90.7483,0,212.98,3336.44, +256,128,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,67.5755,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,40.3167,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.4%,107.8922,0,358.27,2813.57, +256,256,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,142.1512,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,9.6%,0.0,,0.0%,142.1512,1,543.85,2146.56, +256,512,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,152.7835,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,96.2901,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.5%,249.0736,0,620.78,1237.71, +256,1024,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,258.0873,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,169.0166,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.6%,427.1039,0,724.03,736.53, +256,2048,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,468.703,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,308.263,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.4%,776.966,0,796.01,421.07, +256,4096,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,891.308,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,579.9584,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,14.5%,1471.2664,0,840.74,239.47, +256,8192,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,2840.6239,_ZN5aiter45fmoe_bf16_blockscaleFp8_g1u1_novs_silu_32x384E,0.0%,0.0,,0.0%,2840.6239,1,870.9,141.75, +256,16384,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,5217.5794,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,5217.5794,1,948.29,96.47, +256,32768,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,10403.6177,_ZN5aiter45fmoe_bf16_blockscaleFp8_g1u1_novs_silu_32x384E,0.0%,0.0,,0.0%,10403.6177,1,951.17,67.73, +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,8,19.0419,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.1%,8.3931,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,27.435,0,2.75,22015.38, +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,4,23.3333,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.0%,13.9733,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,37.3066,0,4.05,16190.28, +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,42.2041,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,42.2041,1,7.16,14312.09, +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,48.6586,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,48.6586,1,12.41,12414.62, +256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,73.5923,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,73.5923,1,16.41,8209.78, +256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,110.8486,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,110.8486,1,21.79,5452.24, +256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,120.831,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,120.831,1,39.99,5005.06, +256,128,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,122.4552,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,122.4552,1,78.92,4945.1, +256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,83.462,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,45.7736,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,129.2356,0,149.55,4697.82, +256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,110.1493,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,58.3348,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,168.4841,0,229.43,3622.13, +256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,154.4034,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,87.1208,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,241.5242,0,320.09,2552.8, +256,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,209.3299,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,170.4266,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,379.7565,0,407.15,1656.71, +256,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,384.5051,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,308.9218,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,693.4269,0,445.96,943.59, +256,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,646.6441,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,527.7251,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1174.3692,0,526.64,600.02, +256,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,1808.4732,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,1808.4732,1,683.98,445.3, +256,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,3449.1606,_ZN5aiter43fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384E,0.0%,0.0,,0.0%,3449.1606,1,717.25,291.85, diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py index f28a15227a..a5e96fa4a3 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +import copy from dataclasses import dataclass import os import sys @@ -402,7 +403,7 @@ def get_gemm1_kernels_list( else: raise ValueError(f"Unsupported data type combination: {Adtype}, {Bdtype}") - kernels_list = gemm1_kernels_dict[tag] + kernels_list = {k: copy.deepcopy(v) for k, v in gemm1_kernels_dict[tag].items()} for id, kernel in kernels_list.items(): kernel.MulRoutedWeight = MulRoutedWeight kernel.ActOP = ActOP == "silu" @@ -472,7 +473,7 @@ def get_gemm2_kernels_list( tag = "a4w4_bns" else: raise ValueError(f"Unsupported data type combination: {Adtype}, {Bdtype}") - kernels_list = gemm2_kernels_dict[tag] + kernels_list = {k: copy.deepcopy(v) for k, v in gemm2_kernels_dict[tag].items()} for id, kernel in kernels_list.items(): kernel.MulRoutedWeight = MulRoutedWeight kernel.Nswizzle = Nswizzle diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py index 633b25c7e2..7ee5da016f 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py @@ -586,13 +586,7 @@ def generate_data( else: w1_qt = w1_qt.view(w1.shape[0], w1.shape[1], w1.shape[2] // 2) w2_qt = w2_qt.view(w2.shape[0], w2.shape[1], w2.shape[2] // 2) - score = torch.zeros((token, expert), dtype=dtype) - start_col = 0 - end_col = topk - for token_id in range(token): - score[token_id, start_col:end_col] = 1.0 - start_col = end_col % expert - end_col = start_col + topk + score = torch.randn((token, expert), dtype=dtype) topk_weights, topk_ids = fused_topk(input, score, topk, True) if q_type == QuantType.per_1x128: a1_qt, a1_scale = aiter.pertoken_quant(