diff --git a/aiter/configs/tuned_fmoe.csv b/aiter/configs/tuned_fmoe.csv index 8fb7ac4634..bca541c83e 100644 --- a/aiter/configs/tuned_fmoe.csv +++ b/aiter/configs/tuned_fmoe.csv @@ -764,4 +764,11 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,597.1745,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,373.7546,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,970.9291,0,46.45,2908.63 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,734.3088,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,509.0543,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1243.3631,0,72.54,2275.75 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08 -80,56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,228.7482,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.5%,0.0,Null,0.0%,228.7482,1,73.93,2644.88 \ No newline at end of file +80,56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,228.7482,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.5%,0.0,Null,0.0%,228.7482,1,73.93,2644.88 +80,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,245.0416,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,245.0416,1,6.47,5775.08 +80,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,370.7841,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,370.7841,1,8.55,3817.53 +80,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,428.9409,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,428.9409,1,14.78,3301.54 +80,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,519.4664,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,519.4664,1,24.42,2728.85 +80,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,536.7655,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,536.7655,1,47.26,2646.03 +80,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,560.4425,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,560.4425,1,90.53,2544.06 +80,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35