Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag
256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,21.0807,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.0%,9.8239,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.7%,30.9046,0,3.21,45780.0,
256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,26.5397,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,15.1899,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.3%,41.7296,0,4.75,33904.81,
256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,40.1046,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,1.0%,20.1531,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.2%,60.2577,0,6.58,23480.44,
256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,4,64.5498,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.8%,29.69,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.6%,94.2398,0,8.41,15014.5,Expand commentComment on line R5
256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,157.0122,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,157.0122,1,10.1,9012.9,
256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,4,181.1403,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,83.3332,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.6%,264.4735,0,11.99,5352.06,
256,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,2,185.6359,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.7%,86.0725,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.4%,271.7084,0,23.34,5212.09,
256,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,2,193.3333,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.7%,87.6417,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,280.975,0,45.14,5045.09,
256,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,195.8151,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,90.705,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,286.5201,0,88.54,4957.06,
256,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,203.8627,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,100.2588,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.5%,304.1215,0,166.82,4688.26,
256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,418.9197,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,418.9197,1,242.21,3429.8,
256,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,574.8491,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,574.8491,1,353.03,2537.76,
256,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,799.6824,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,799.6824,1,507.54,1879.34,
256,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1298.6464,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1298.6464,1,625.07,1225.08,
256,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2333.2857,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,2333.2857,1,695.8,757.35,
256,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,4225.6692,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,4225.6692,1,768.4,501.56,
256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,8,20.5773,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,11.8129,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.1%,32.3902,0,3.06,43680.27,
256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,7,27.4554,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.9%,15.5186,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.9%,42.974,0,4.61,32923.03,
256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,2,39.5086,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscaleSplitk_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.5%,21.3461,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,60.8547,0,6.51,23250.09,
256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,77.4907,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,77.4907,1,10.23,18259.78,
256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,113.4418,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,113.4418,1,13.98,12474.55,
256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,173.2023,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,173.2023,1,18.31,8172.4,
256,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,223.1917,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,223.1917,1,28.41,6345.07,
256,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,163.6759,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,86.9276,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.9%,250.6035,0,50.61,5656.52,
256,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,195.9816,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,90.1231,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.8%,286.1047,0,88.66,4964.25,
256,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,207.1839,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,98.0931,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.7%,305.277,0,166.19,4670.52,
256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,385.394,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,,0.0%,385.394,1,263.29,3728.16,
256,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,450.5774,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,450.5774,1,450.39,3237.69,
256,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,670.9652,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,670.9652,1,604.91,2239.87,
256,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1266.7307,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1266.7307,1,640.82,1255.95,
256,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2251.03,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,2251.03,1,721.22,785.02,
256,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,4483.8995,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,4483.8995,1,724.15,472.68,
Loading
Loading