Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions aiter/configs/model_configs/glm47_fp8_tuned_fmoe.csv
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,23 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,
256,8192,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,1006.4097,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,,0.0%,1006.4097,1,432.1,596.81,0,
256,16384,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,1783.2552,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192E,0.0%,0.0,,0.0%,1783.2552,1,487.72,407.38,0,
256,32768,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,3018.9246,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192E,0.0%,0.0,,0.0%,3018.9246,1,576.19,324.0,0,
256,1,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,34.2017,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,24.8089,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.1%,59.0106,0,8.0,16392.42,0,
256,2,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,53.9247,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,34.0338,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.8%,87.9585,0,10.73,10997.71,0,
256,3,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,71.4773,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,43.4922,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.6%,114.9695,0,12.31,8414.04,0,
256,4,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,80.5077,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,49.8766,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.5%,130.3843,0,14.48,7419.4,0,
256,5,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,93.8975,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,55.7383,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.7%,149.6358,0,15.77,6464.95,0,
256,6,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,103.6197,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,60.6288,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.6%,164.2485,0,17.24,5889.88,0,
256,7,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,109.1971,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,62.836,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.1%,172.0331,0,19.2,5623.45,0,
256,8,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,112.2744,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf2E,0.0%,64.2648,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.0%,176.5392,0,21.38,5480.0,0,
256,16,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,124.2717,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2E,0.0%,73.5111,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,197.7828,0,38.17,4892.02,0,
256,32,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,125.0859,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2E,0.0%,74.3505,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.2%,199.4364,0,75.71,4852.69,0,
256,64,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,127.6275,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,76.0017,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,203.6292,0,148.3,4755.18,0,
256,128,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,137.2637,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,78.6014,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,215.8651,0,279.8,4490.2,0,
256,256,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,146.6375,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_64x64_2tg_pf3E,0.0%,108.8512,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,255.4887,0,472.8,3801.51,0,
256,512,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,181.9129,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,154.738,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,336.6509,0,717.63,2896.7,0,
256,1024,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,64,0,274.0156,moe_ck2stages_gemm1_256x64x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,254.0807,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,528.0963,0,914.95,1861.48,0,
256,2048,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,424.1973,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,465.4318,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,889.6291,0,1086.26,1122.68,0,
256,4096,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,769.8017,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,847.1011,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,1616.9028,0,1195.33,637.16,0,
256,8192,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,1594.531,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,1616.1463,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,3210.6773,0,1203.94,340.47,0,
256,16384,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,3125.3229,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,3153.9495,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,6279.2724,0,1231.18,194.13,0,
256,32768,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,6319.3863,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,6430.411,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,4.3%,12749.7973,0,1212.72,115.35,0,
22 changes: 21 additions & 1 deletion aiter/configs/model_configs/glm47_fp8_untuned_fmoe.csv
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,24 @@ token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,
4096,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
8192,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
16384,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
32768,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
32768,5120,192,161,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
1,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
2,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
3,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
4,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
5,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
6,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
7,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
8,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
16,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
32,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
64,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
128,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
256,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
512,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
1024,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
2048,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
4096,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
8192,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
16384,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
32768,5120,1536,41,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0
Loading