Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,942 changes: 1,471 additions & 1,471 deletions aiter/configs/a4w4_blockscale_tuned_gemm.csv

Large diffs are not rendered by default.

118 changes: 59 additions & 59 deletions aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions aiter/configs/a8w8_blockscale_tuned_gemm.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio
256,8192,512,7168,ck,0,0,64.1614,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,937.16,1103.14,0.0
256,16384,512,7168,cktile,11,0,98.713,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1218.27,1396.85,0.0
256,20480,512,7168,cktile,27,0,95.1492,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1579.88,1801.82,0.0
256,128,1024,4096,ck,8,0,13.7599,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.03,361.97,0.0
256,128,4096,1280,ck,7,0,7.4194,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,180.9,870.06,0.0
gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio
gfx950,256,8192,512,7168,ck,0,0,64.1614,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,937.16,1103.14,0.0
gfx950,256,16384,512,7168,cktile,11,0,98.713,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1218.27,1396.85,0.0
gfx950,256,20480,512,7168,cktile,27,0,95.1492,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1579.88,1801.82,0.0
gfx950,256,128,1024,4096,ck,8,0,13.7599,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.03,361.97,0.0
gfx950,256,128,4096,1280,ck,7,0,7.4194,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,180.9,870.06,0.0
1,129 changes: 551 additions & 578 deletions aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv

Large diffs are not rendered by default.

54 changes: 27 additions & 27 deletions aiter/configs/a8w8_tuned_batched_gemm.csv
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
cu_num,B,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio
304,16,32,1280,8192,28,0,68.9821,a8w8_batched_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,155.6551,5004.8295,0.0
304,16,64,1280,8192,21,0,74.9374,a8w8_batched_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,286.5703,4736.5264,0.0
304,16,128,1280,8192,41,0,111.2581,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,386.0364,3364.6236,0.0
304,16,192,1280,8192,11,0,136.9273,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,470.5016,2875.5426,0.0
304,16,256,1280,8192,11,0,150.6582,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,570.1604,2742.2267,0.0
304,16,320,1280,8192,41,0,194.5238,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,551.9848,2223.5716,0.0
304,16,512,1280,8192,4,0,235.9793,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,728.0244,2079.5619,0.0
304,16,1024,1280,8192,4,0,457.3867,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,751.2186,1412.2029,0.0
304,16,2048,1280,8192,13,0,831.9798,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,825.9753,1149.4285,0.0
304,16,4096,1280,8192,39,0,1490.3195,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,922.2113,1058.2015,0.0
304,16,8192,1280,8192,1,0,2894.8037,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,949.5563,973.6661,0.0
304,16,16384,1280,8192,1,0,5696.639,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,965.0529,930.6541,0.0
304,16,1,8192,1024,78,0,37.703,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,7.1197,7127.5593,0.0
304,16,32,8192,1024,62,0,46.8522,a8w8_batched_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,183.3411,5930.8344,0.0
304,16,64,8192,1024,47,0,56.4451,a8w8_batched_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,304.3642,5090.0756,0.0
304,16,128,8192,1024,13,0,78.8949,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,435.5128,3880.9124,0.0
304,16,192,8192,1024,39,0,113.2351,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,455.1558,2870.6519,0.0
304,16,256,8192,1024,13,0,127.391,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,539.4375,2699.8212,0.0
304,16,320,8192,1024,13,0,172.9103,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,496.7856,2098.2399,0.0
304,16,512,8192,1024,13,0,229.5169,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,598.8184,1827.4489,0.0
304,16,1024,8192,1024,13,0,426.5342,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,644.4452,1337.3496,0.0
304,16,2048,8192,1024,13,0,823.4174,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,667.6514,1059.5055,0.0
304,16,4096,8192,1024,1,0,1583.6971,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,694.2689,932.2458,0.0
304,16,8192,8192,1024,13,0,3131.9626,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,702.1231,857.0838,0.0
304,16,16384,8192,1024,1,0,6094.2926,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,721.6665,836.8935,0.0
80,16,1,1280,8192,78,0,86.7259,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,3.869,3872.5159,0.0
gfx,cu_num,B,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio
gfx942,304,16,32,1280,8192,28,0,68.9821,a8w8_batched_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,155.6551,5004.8295,0.0
gfx942,304,16,64,1280,8192,21,0,74.9374,a8w8_batched_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,286.5703,4736.5264,0.0
gfx942,304,16,128,1280,8192,41,0,111.2581,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,386.0364,3364.6236,0.0
gfx942,304,16,192,1280,8192,11,0,136.9273,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,470.5016,2875.5426,0.0
gfx942,304,16,256,1280,8192,11,0,150.6582,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,570.1604,2742.2267,0.0
gfx942,304,16,320,1280,8192,41,0,194.5238,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,551.9848,2223.5716,0.0
gfx942,304,16,512,1280,8192,4,0,235.9793,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,728.0244,2079.5619,0.0
gfx942,304,16,1024,1280,8192,4,0,457.3867,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,751.2186,1412.2029,0.0
gfx942,304,16,2048,1280,8192,13,0,831.9798,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,825.9753,1149.4285,0.0
gfx942,304,16,4096,1280,8192,39,0,1490.3195,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,922.2113,1058.2015,0.0
gfx942,304,16,8192,1280,8192,1,0,2894.8037,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,949.5563,973.6661,0.0
gfx942,304,16,16384,1280,8192,1,0,5696.639,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,965.0529,930.6541,0.0
gfx942,304,16,1,8192,1024,78,0,37.703,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,7.1197,7127.5593,0.0
gfx942,304,16,32,8192,1024,62,0,46.8522,a8w8_batched_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,183.3411,5930.8344,0.0
gfx942,304,16,64,8192,1024,47,0,56.4451,a8w8_batched_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,304.3642,5090.0756,0.0
gfx942,304,16,128,8192,1024,13,0,78.8949,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,435.5128,3880.9124,0.0
gfx942,304,16,192,8192,1024,39,0,113.2351,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,455.1558,2870.6519,0.0
gfx942,304,16,256,8192,1024,13,0,127.391,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,539.4375,2699.8212,0.0
gfx942,304,16,320,8192,1024,13,0,172.9103,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,496.7856,2098.2399,0.0
gfx942,304,16,512,8192,1024,13,0,229.5169,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,598.8184,1827.4489,0.0
gfx942,304,16,1024,8192,1024,13,0,426.5342,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,644.4452,1337.3496,0.0
gfx942,304,16,2048,8192,1024,13,0,823.4174,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,667.6514,1059.5055,0.0
gfx942,304,16,4096,8192,1024,1,0,1583.6971,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,694.2689,932.2458,0.0
gfx942,304,16,8192,8192,1024,13,0,3131.9626,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,702.1231,857.0838,0.0
gfx942,304,16,16384,8192,1024,1,0,6094.2926,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,721.6665,836.8935,0.0
gfx942,80,16,1,1280,8192,78,0,86.7259,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,3.869,3872.5159,0.0
Loading
Loading