diff --git a/aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv b/aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv index 712e562624..592ef14033 100644 --- a/aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv +++ b/aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv @@ -47,3 +47,23 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,8192,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,1006.4097,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,,0.0%,1006.4097,1,432.1,596.81,0, 256,16384,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,1783.2552,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192E,0.0%,0.0,,0.0%,1783.2552,1,487.72,407.38,0, 256,32768,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,3018.9246,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192E,0.0%,0.0,,0.0%,3018.9246,1,576.19,324.0,0, +256,1,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,34.2017,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,24.8089,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.1%,59.0106,0,8.0,16392.42,0, +256,2,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,53.9247,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,34.0338,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.8%,87.9585,0,10.73,10997.71,0, +256,3,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,71.4773,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,43.4922,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.6%,114.9695,0,12.31,8414.04,0, +256,4,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,80.5077,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,49.8766,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.5%,130.3843,0,14.48,7419.4,0, +256,5,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,93.8975,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,55.7383,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.7%,149.6358,0,15.77,6464.95,0, +256,6,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,103.6197,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,60.6288,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.6%,164.2485,0,17.24,5889.88,0, +256,7,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,109.1971,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,62.836,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.1%,172.0331,0,19.2,5623.45,0, +256,8,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,112.2744,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf2E,0.0%,64.2648,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.0%,176.5392,0,21.38,5480.0,0, +256,16,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,124.2717,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2E,0.0%,73.5111,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,197.7828,0,38.17,4892.02,0, +256,32,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,125.0859,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2E,0.0%,74.3505,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.2%,199.4364,0,75.71,4852.69,0, +256,64,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,127.6275,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,76.0017,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,203.6292,0,148.3,4755.18,0, +256,128,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,137.2637,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,78.6014,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,215.8651,0,279.8,4490.2,0, +256,256,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,146.6375,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_64x64_2tg_pf3E,0.0%,108.8512,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,255.4887,0,472.8,3801.51,0, +256,512,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,181.9129,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,154.738,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,336.6509,0,717.63,2896.7,0, +256,1024,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,274.0156,moe_ck2stages_gemm1_256x64x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,254.0807,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,528.0963,0,914.95,1861.48,0, +256,2048,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,424.1973,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,465.4318,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,889.6291,0,1086.26,1122.68,0, +256,4096,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,769.8017,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,847.1011,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,1616.9028,0,1195.33,637.16,0, +256,8192,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,1594.531,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,1616.1463,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,3210.6773,0,1203.94,340.47,0, +256,16384,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,3125.3229,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,3153.9495,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,6279.2724,0,1231.18,194.13,0, +256,32768,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,6319.3863,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,6430.411,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,12749.7973,0,1212.72,115.35,0, diff --git a/aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv b/aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv index befeac9f54..2beca4f695 100644 --- a/aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv +++ b/aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv @@ -46,4 +46,24 @@ token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type, 4096,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 8192,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 16384,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 -32768,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 \ No newline at end of file +32768,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +1,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +2,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +3,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +4,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +5,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +6,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +7,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +8,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +16,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +32,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +64,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +128,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +256,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +512,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +1024,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +2048,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +4096,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +8192,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +16384,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0 +32768,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0