Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 58 additions & 58 deletions aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv

Large diffs are not rendered by default.

762 changes: 319 additions & 443 deletions aiter/configs/model_configs/dsv4_bf16_tuned_gemm.csv

Large diffs are not rendered by default.

159 changes: 87 additions & 72 deletions aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv

Large diffs are not rendered by default.

178 changes: 89 additions & 89 deletions aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv

Large diffs are not rendered by default.

36 changes: 20 additions & 16 deletions aiter/configs/model_configs/kimi_bf16_tuned_gemm.csv
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw
gfx950,256,64,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,opus,200,7,14.2911,opus_gemm_flatmm_splitk_256x64x64x64_2x1_16x16x32_0x0x0_wgpcu2,0.0,135.59,2201.75
gfx950,256,1024,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,51.9154,native,0.0,597.21,949.3
gfx950,256,4096,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,114.1594,native,0.0,1086.35,931.15
gfx950,256,16384,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,opus,2009,0,442.3447,opus_gemm_512x256x256x64_2x4_16x16x32_0x0x0_cA1cB17,0.0,1121.45,755.89
gfx950,256,1024,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,17.9972,native,0.0,536.95,1048.74
gfx950,256,4096,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,39.7173,native,0.0,973.25,1188.04
gfx950,256,16384,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,127.3489,native,0.0,1214.14,1259.78
gfx950,256,256,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,11155,1,5.2871,flydsl_gemm5_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,203.09,1239.55
gfx950,256,1024,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,10.1942,native,0.0,421.31,1337.18
gfx950,256,4096,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,21.2784,native,0.0,807.39,1971.16
gfx950,256,16384,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,78.0099,native,0.0,880.91,1989.35
gfx950,256,96,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,7640,1,5.1829,flydsl_gemm4_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,135.96,1700.71
gfx950,256,256,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,507,1,7.3211,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,256.66,1539.69
gfx950,256,1024,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3886,1,13.203,flydsl_gemm3_abf16_wbf16_bf16_t128x256x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,569.28,1747.23
gfx950,256,4096,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,40.4141,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,743.92,1738.37
gfx950,256,16384,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,131.6552,native,0.0,913.44,1967.25
gfx950,256,1024,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,20.1108,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0104,280.3,1042.8
gfx950,256,96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4127,4,11.0603,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0142,127.42,1469.48
gfx950,256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1926,7,10.6796,flydsl_gemm6_abf16_wbf16_bf16_t32x64x64_split_k7_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0199,90.72,2890.7
gfx950,256,64,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2321,7,13.7074,flydsl_gemm5_abf16_wbf16_bf16_t64x64x64_split_k7_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0199,141.37,2295.51
gfx950,256,1024,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1542,1,68.5856,flydsl_gemm3_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp1_block_n_warp1_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,452.05,718.56
gfx950,256,4096,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,117.8273,native,0.0,1052.53,902.16
gfx950,256,16384,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,480.3821,native,0.0,1032.65,696.04
gfx950,256,1024,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,5322,1,17.6546,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,547.37,1069.09
gfx950,256,4096,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,40.3383,native,0.0,958.26,1169.75
gfx950,256,16384,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,130.6942,native,0.0,1183.06,1227.54
gfx950,256,256,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4620,1,5.2609,flydsl_gemm6_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,204.1,1245.72
gfx950,256,1024,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2875,1,9.5558,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,449.46,1426.51
gfx950,256,4096,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,20.9336,native,0.0,820.68,2003.62
gfx950,256,16384,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,80.4652,native,0.0,854.03,1928.65
gfx950,256,96,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3107,1,5.2439,flydsl_gemm4_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,134.37,1680.92
gfx950,256,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4843,1,5.4135,flydsl_gemm7_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,173.55,1719.06
gfx950,256,256,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2961,1,6.9191,flydsl_gemm4_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp4_block_n_warp1_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,271.57,1629.14
gfx950,256,1024,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,1,13.7628,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp4_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,546.12,1676.16
gfx950,256,4096,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,40.9031,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,735.02,1717.59
gfx950,256,16384,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,131.8454,native,0.0,912.12,1964.41
gfx942,80,512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,opus,50200,6,35.8617,opus_gemm_gfx942_splitk_legacy_512x128x128x64_2x4_16x16x16_0x0x0,0.0,78.6,369.15
gfx942,80,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,opus,50000,0,14.1933,opus_gemm_gfx942_512x128x128x64_2x4_16x16x16_0x0x0,0.0,66.19,655.67
Loading
Loading