diff --git a/aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv b/aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv new file mode 100644 index 0000000000..274c78f663 --- /dev/null +++ b/aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv @@ -0,0 +1,17 @@ +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag +256,1,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,24.9944,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.00%,20.3787,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,45.3731,0,8.32,20799.41 +256,2,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,40.11,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.00%,28.5666,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,68.6766,0,10.99,13741.93 +256,4,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,99.8134,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.00%,0,,0.00%,99.8134,1,15.13,9455.44 +256,8,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,134.9101,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.00%,0,,0.00%,134.9101,1,22.38,6996.08 +256,16,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,156.6731,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.00%,0,,0.00%,156.6731,1,38.55,6025.06 +256,32,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,164.2209,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.00%,0,,0.00%,164.2209,1,73.56,5749.63 +256,64,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,168.1699,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.00%,0,,0.00%,168.1699,1,143.66,5617.54 +256,128,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,177.1016,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x384E,0.00%,0,,0.00%,177.1016,1,272.83,5339.79 +256,256,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,111.1445,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf3E,0.00%,94.3821,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,205.5266,0,470.19,4610.84 +256,512,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,128.617,_ZN5aiter45fmoe_stage1_bf16_pertokenFp8_g1u1_128x128_pf3E,0.00%,136.7373,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,265.3543,0,728.36,3586.08 +256,1024,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,227.447,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.00%,192.093,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,419.54,0,921.36,2286.9 +256,2048,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,352.8768,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.00%,345.4655,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.70%,698.3423,0,1107.04,1396.42 +256,4096,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,582.0723,moe_ck2stages_gemm1_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.00%,641.9346,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,1224.0069,0,1263.22,822.41 +256,8192,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,1078.8184,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.00%,1239.192,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,2318.0104,0,1334.06,461.41 +256,16384,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,1988.7154,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.00%,2328.3393,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,4317.0547,0,1432.63,276.9 +256,32768,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,3930.1981,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.00%,4488.9205,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.80%,8419.1186,0,1469.22,171.87 diff --git a/aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv b/aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv new file mode 100644 index 0000000000..9f72b923bd --- /dev/null +++ b/aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv @@ -0,0 +1,17 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +1,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +2,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +4,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +8,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +16,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +32,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +64,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +128,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +256,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +512,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +1024,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +2048,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +4096,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +8192,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +16384,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +32768,5120,1536,40,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0