diff --git a/aiter/configs/a8w8_blockscale_tuned_gemm.csv b/aiter/configs/a8w8_blockscale_tuned_gemm.csv index 33ec51f766..21584aaa94 100755 --- a/aiter/configs/a8w8_blockscale_tuned_gemm.csv +++ b/aiter/configs/a8w8_blockscale_tuned_gemm.csv @@ -117,122 +117,83 @@ cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio 304,20480,512,7168,2,0,256.7931,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,585.39,667.63,0.0 304,20480,4096,512,0,0,183.337,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,468.53,983.74,0.0 256,16,1536,7168,8,0,20.8535,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.9,535.83,0.0 -256,16,3072,1536,8,0,7.66,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.71,632.05,0.0 256,16,576,7168,8,0,19.9031,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.64,214.13,0.0 256,16,7168,256,8,0,3.6287,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.18,570.03,0.0 -256,16,7168,2048,8,0,8.0688,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,58.22,1851.85,0.0 256,16,4608,7168,8,0,21.2231,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.8,1568.68,0.0 256,16,7168,2304,8,0,8.6748,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,60.92,1934.49,0.0 256,16,512,7168,8,0,19.9419,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.89,190.61,0.0 -256,16,4096,512,13,0,3.5903,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,18.69,622.91,0.0 256,32,1536,7168,8,0,20.7843,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.9,545.49,0.0 -256,32,3072,1536,8,0,7.8864,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.29,629.48,0.0 256,32,576,7168,8,0,19.9519,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.24,220.28,0.0 256,32,7168,256,7,0,3.6839,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,31.88,624.87,0.0 -256,32,7168,2048,8,0,8.2088,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.45,1852.2,0.0 256,32,4608,7168,8,0,21.1971,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.73,1582.97,0.0 256,32,7168,2304,8,0,8.942,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,118.2,1906.46,0.0 256,32,512,7168,8,0,19.9219,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.79,197.38,0.0 -256,32,4096,512,7,0,3.8435,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,34.92,618.1,0.0 256,64,1536,7168,8,0,20.5651,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.53,567.24,0.0 -256,64,3072,1536,18,0,8.078,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,74.77,644.98,0.0 256,64,576,7168,8,0,19.7455,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,26.76,236.07,0.0 256,64,7168,256,11,0,3.7547,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,62.56,737.45,0.0 -256,64,7168,2048,7,0,9.2928,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,202.2,1692.56,0.0 256,64,4608,7168,7,0,24.0952,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,175.46,1414.34,0.0 256,64,7168,2304,7,0,10.0892,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,209.52,1742.46,0.0 256,64,512,7168,8,0,19.6915,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.86,213.0,0.0 -256,64,4096,512,13,0,4.0395,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,66.45,657.06,0.0 256,128,1536,7168,8,0,20.1019,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.21,612.92,0.0 -256,128,3072,1536,7,0,7.8296,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,154.28,728.21,0.0 256,128,576,7168,8,0,19.3303,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.68,268.68,0.0 256,128,7168,256,7,0,3.9827,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,117.95,929.72,0.0 -256,128,7168,2048,18,0,10.1248,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,371.18,1657.04,0.0 256,128,4608,7168,18,0,24.7832,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,341.19,1417.38,0.0 256,128,7168,2304,18,0,11.0009,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,384.32,1694.86,0.0 256,128,512,7168,8,0,19.0891,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.22,247.19,0.0 -256,128,4096,512,8,0,4.3587,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,123.17,736.75,0.0 256,256,1536,7168,7,0,23.4904,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,239.98,580.3,0.0 -256,256,3072,1536,12,0,8.9712,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,269.3,745.13,0.0 256,256,576,7168,8,0,19.6847,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,107.39,317.95,0.0 256,256,7168,256,18,0,5.6643,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,165.87,983.45,0.0 -256,256,7168,2048,18,0,14.8749,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,505.29,1268.87,0.0 256,256,4608,7168,18,0,36.4007,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,464.59,1022.63,0.0 256,256,7168,2304,18,0,15.9582,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,529.87,1301.83,0.0 256,256,512,7168,8,0,19.4919,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,96.4,295.88,0.0 -256,256,4096,512,13,0,5.096,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,210.7,848.78,0.0 256,512,1536,7168,18,0,24.5324,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,459.57,662.51,0.0 -256,512,3072,1536,18,0,11.8645,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,407.25,729.13,0.0 256,512,576,7168,18,0,23.3536,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,181.04,359.2,0.0 256,512,7168,256,13,0,8.214,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,228.76,1132.96,0.0 -256,512,7168,2048,18,0,25.7748,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,583.22,895.01,0.0 256,512,4608,7168,18,0,58.822,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,575.0,704.14,0.0 256,512,7168,2304,18,0,28.4393,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,594.65,880.29,0.0 256,512,512,7168,8,0,20.1543,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,186.47,390.21,0.0 -256,512,4096,512,3,0,8.2148,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,261.42,797.78,0.0 256,1024,1536,7168,18,0,36.4647,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,618.37,589.5,0.0 -256,1024,3072,1536,18,0,17.5794,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,549.72,715.78,0.0 256,1024,576,7168,18,0,23.9404,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,353.2,528.33,0.0 256,1024,7168,256,16,0,12.6589,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,296.87,1325.33,0.0 -256,1024,7168,2048,0,0,46.861,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,641.57,671.29,0.0 256,1024,4608,7168,18,0,99.4691,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,680.07,500.73,0.0 256,1024,7168,2304,0,0,49.965,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,676.93,671.56,0.0 256,1024,512,7168,7,0,23.5068,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,319.75,512.98,0.0 -256,1024,4096,512,18,0,11.0785,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,387.68,993.82,0.0 256,1536,1536,7168,18,0,58.096,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,582.19,460.25,0.0 -256,1536,3072,1536,18,0,25.5804,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,566.66,645.61,0.0 256,1536,576,7168,18,0,25.1476,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,504.37,672.36,0.0 256,1536,7168,256,18,0,16.5106,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,341.43,1468.65,0.0 -256,1536,7168,2048,0,0,66.7266,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,675.85,597.15,0.0 256,1536,4608,7168,18,0,139.826,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,725.68,416.2,0.0 256,1536,7168,2304,0,0,73.8284,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,687.19,569.89,0.0 256,1536,512,7168,18,0,24.7692,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,455.17,656.17,0.0 -256,1536,4096,512,18,0,14.5922,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,441.5,1059.92,0.0 256,2048,1536,7168,18,0,64.5154,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,699.01,495.72,0.0 -256,2048,3072,1536,2,0,31.1726,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,620.01,655.94,0.0 256,2048,576,7168,18,0,35.1627,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,480.95,602.01,0.0 256,2048,7168,256,11,0,21.3779,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,351.59,1483.75,0.0 -256,2048,7168,2048,0,0,84.8351,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,708.78,568.57,0.0 256,2048,4608,7168,18,0,180.0698,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,751.33,369.77,0.0 256,2048,7168,2304,0,0,92.2125,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,733.59,548.67,0.0 256,2048,512,7168,18,0,25.826,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,582.06,791.73,0.0 -256,2048,4096,512,18,0,18.1866,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.32,1095.47,0.0 256,4096,1536,7168,18,0,124.0493,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,727.08,426.87,0.0 -256,4096,3072,1536,0,0,56.5736,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,683.26,639.45,0.0 256,4096,576,7168,18,0,57.2232,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,591.07,667.69,0.0 256,4096,7168,256,18,0,37.2575,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,403.47,1653.46,0.0 -256,4096,7168,2048,0,0,150.9763,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,796.54,541.73,0.0 256,4096,4608,7168,0,0,310.9487,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,870.19,322.04,0.0 256,4096,7168,2304,0,0,161.8222,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,836.05,523.24,0.0 256,4096,512,7168,18,0,44.5137,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,675.4,836.25,0.0 -256,4096,4096,512,18,0,31.979,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,537.22,1180.42,0.0 256,8192,1536,7168,0,0,210.661,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,856.3,450.47,0.0 -256,8192,3072,1536,0,0,103.7592,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,745.08,651.83,0.0 256,8192,576,7168,18,0,101.6527,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,665.46,711.11,0.0 256,8192,7168,256,17,0,73.9508,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,406.55,1641.26,0.0 -256,8192,7168,2048,0,0,269.0296,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,894.02,553.46,0.0 256,8192,4608,7168,0,0,539.5795,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1002.94,309.96,0.0 256,8192,7168,2304,0,0,285.6152,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,947.37,535.09,0.0 256,8192,512,7168,18,0,86.0755,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,698.57,822.29,0.0 -256,8192,4096,512,18,0,63.5698,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,540.5,1154.64,0.0 256,16384,1536,7168,0,0,371.4478,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,971.27,481.31,0.0 -256,16384,3072,1536,0,0,186.8656,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,827.43,698.62,0.0 256,16384,576,7168,18,0,186.7632,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,724.4,751.99,0.0 256,16384,7168,256,17,0,141.6637,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,424.45,1700.58,0.0 -256,16384,7168,2048,0,0,508.5984,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,945.81,556.66,0.0 256,16384,4608,7168,0,0,1046.6928,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1034.05,288.02,0.0 256,16384,7168,2304,0,0,539.2895,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1003.48,536.16,0.0 256,16384,512,7168,0,0,143.7706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,836.47,959.08,0.0 -256,16384,4096,512,0,0,119.5724,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,574.71,1210.17,0.0 256,20480,1536,7168,0,0,464.4401,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,971.0,475.25,0.0 -256,20480,3072,1536,0,0,223.3253,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,865.43,725.42,0.0 256,20480,576,7168,18,0,230.9895,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,732.13,755.54,0.0 256,20480,7168,256,18,0,167.1547,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,449.65,1798.81,0.0 -256,20480,7168,2048,0,0,630.5854,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,953.55,555.4,0.0 256,20480,4608,7168,0,0,1309.3769,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1033.25,281.49,0.0 256,20480,7168,2304,0,0,670.7196,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1008.55,532.71,0.0 256,20480,512,7168,2,0,193.5078,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,776.84,885.97,0.0 -256,20480,4096,512,18,0,144.807,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,593.2,1245.49,0.0 80,16,1536,7168,8,0,23.9251,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.73,467.04,0.0 80,16,3072,1536,8,0,8.4118,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.95,575.56,0.0 80,16,576,7168,8,0,23.4551,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.63,181.7,0.0 diff --git a/aiter/configs/a8w8_blockscale_untuned_gemm.csv b/aiter/configs/a8w8_blockscale_untuned_gemm.csv index 95119f5113..945b54f5b9 100644 --- a/aiter/configs/a8w8_blockscale_untuned_gemm.csv +++ b/aiter/configs/a8w8_blockscale_untuned_gemm.csv @@ -1,118 +1,79 @@ M,N,K 16, 1536, 7168 -16, 3072, 1536 16, 576, 7168 16, 7168, 256 -16, 7168, 2048 16, 4608, 7168 16, 7168, 2304 16, 512, 7168 -16, 4096, 512 32, 1536, 7168 -32, 3072, 1536 32, 576, 7168 32, 7168, 256 -32, 7168, 2048 32, 4608, 7168 32, 7168, 2304 32, 512, 7168 -32, 4096, 512 64, 1536, 7168 -64, 3072, 1536 64, 576, 7168 64, 7168, 256 -64, 7168, 2048 64, 4608, 7168 64, 7168, 2304 64, 512, 7168 -64, 4096, 512 128, 1536, 7168 -128, 3072, 1536 128, 576, 7168 128, 7168, 256 -128, 7168, 2048 128, 4608, 7168 128, 7168, 2304 128, 512, 7168 -128, 4096, 512 256, 1536, 7168 -256, 3072, 1536 256, 576, 7168 256, 7168, 256 -256, 7168, 2048 256, 4608, 7168 256, 7168, 2304 256, 512, 7168 -256, 4096, 512 512, 1536, 7168 -512, 3072, 1536 512, 576, 7168 512, 7168, 256 -512, 7168, 2048 512, 4608, 7168 512, 7168, 2304 512, 512, 7168 -512, 4096, 512 1024, 1536, 7168 -1024, 3072, 1536 1024, 576, 7168 1024, 7168, 256 -1024, 7168, 2048 1024, 4608, 7168 1024, 7168, 2304 1024, 512, 7168 -1024, 4096, 512 1536, 1536, 7168 -1536, 3072, 1536 1536, 576, 7168 1536, 7168, 256 -1536, 7168, 2048 1536, 4608, 7168 1536, 7168, 2304 1536, 512, 7168 -1536, 4096, 512 2048, 1536, 7168 -2048, 3072, 1536 2048, 576, 7168 2048, 7168, 256 -2048, 7168, 2048 2048, 4608, 7168 2048, 7168, 2304 2048, 512, 7168 -2048, 4096, 512 4096, 1536, 7168 -4096, 3072, 1536 4096, 576, 7168 4096, 7168, 256 -4096, 7168, 2048 4096, 4608, 7168 4096, 7168, 2304 4096, 512, 7168 -4096, 4096, 512 8192, 1536, 7168 -8192, 3072, 1536 8192, 576, 7168 8192, 7168, 256 -8192, 7168, 2048 8192, 4608, 7168 8192, 7168, 2304 8192, 512, 7168 -8192, 4096, 512 16384, 1536, 7168 -16384, 3072, 1536 16384, 576, 7168 16384, 7168, 256 -16384, 7168, 2048 16384, 4608, 7168 16384, 7168, 2304 16384, 512, 7168 -16384, 4096, 512 20480, 1536, 7168 -20480, 3072, 1536 20480, 576, 7168 20480, 7168, 256 -20480, 7168, 2048 20480, 4608, 7168 20480, 7168, 2304 20480, 512, 7168 -20480, 4096, 512 diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv new file mode 100644 index 0000000000..4d3d45ab3a --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv @@ -0,0 +1,68 @@ +cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +256,1,2112,7168,8,0,23.1423,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.31,654.65,0.0 +256,2,2112,7168,8,0,23.1821,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.61,654.02,0.0 +256,4,2112,7168,8,0,23.0544,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.25,658.63,0.0 +256,8,2112,7168,8,0,23.2634,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.41,654.67,0.0 +256,16,2112,7168,8,0,20.2497,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.92,756.61,0.0 +256,32,2112,7168,8,0,20.0548,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.31,773.05,0.0 +256,64,2112,7168,8,0,19.8696,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.52,798.6,0.0 +256,128,2112,7168,8,0,20.2426,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.45,819.9,0.0 +256,256,2112,7168,18,0,22.1462,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,350.0,815.27,0.0 +256,512,2112,7168,18,0,29.0228,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,534.14,722.59,0.0 +256,1024,2112,7168,18,0,44.836,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,691.5,597.83,0.0 +256,2048,2112,7168,18,0,74.5268,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,832.03,516.19,0.0 +256,4096,2112,7168,0,0,124.1349,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,999.05,497.85,0.0 +256,8192,2112,7168,0,0,198.3448,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1250.52,546.84,0.0 +256,16384,2112,7168,0,0,351.5932,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1410.92,573.92,0.0 +256,32768,2112,7168,0,0,706.8872,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1403.53,549.5,0.0 +256,1,3072,1536,8,0,7.4103,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.27,637.8,0.0 +256,2,3072,1536,8,0,7.4266,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.54,637.43,0.0 +256,4,3072,1536,13,0,7.8966,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,4.78,601.44,0.0 +256,8,3072,1536,18,0,7.9267,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,9.52,603.03,0.0 +256,16,3072,1536,8,0,7.4157,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.36,652.87,0.0 +256,32,3072,1536,8,0,7.0426,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.88,704.9,0.0 +256,64,3072,1536,8,0,7.5522,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,79.97,689.88,0.0 +256,128,3072,1536,8,0,8.1319,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,148.55,701.14,0.0 +256,256,3072,1536,18,0,7.6382,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.29,875.16,0.0 +256,512,3072,1536,18,0,10.6592,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,453.3,811.58,0.0 +256,1024,3072,1536,18,0,15.1443,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,638.11,830.87,0.0 +256,2048,3072,1536,2,0,23.8432,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,810.6,857.57,0.0 +256,4096,3072,1536,0,0,37.7108,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1025.03,959.3,0.0 +256,8192,3072,1536,0,0,66.4578,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1163.29,1017.69,0.0 +256,16384,3072,1536,0,0,125.1134,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1235.83,1043.44,0.0 +256,20480,3072,1536,0,0,223.3253,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,865.43,725.42,0.0 +256,32768,3072,1536,0,0,227.0388,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1362.05,1129.22,0.0 +256,1,4096,512,8,0,3.8518,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.09,546.72,0.0 +256,2,4096,512,8,0,3.8129,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.2,554.58,0.0 +256,4,4096,512,13,0,3.4246,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,4.9,622.55,0.0 +256,8,4096,512,8,0,3.8517,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.71,562.55,0.0 +256,16,4096,512,8,0,3.835,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.5,583.16,0.0 +256,32,4096,512,11,0,3.8544,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,34.82,616.36,0.0 +256,64,4096,512,6,0,4.0748,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,65.88,651.37,0.0 +256,128,4096,512,11,0,4.4446,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,120.79,722.51,0.0 +256,256,4096,512,12,0,4.6202,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,232.4,936.19,0.0 +256,512,4096,512,18,0,7.2254,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,297.21,907.02,0.0 +256,1024,4096,512,0,0,9.5064,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,451.8,1158.17,0.0 +256,2048,4096,512,0,0,14.9624,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,574.1,1331.53,0.0 +256,4096,4096,512,0,0,25.9147,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,662.94,1456.65,0.0 +256,8192,4096,512,0,0,45.4707,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,755.65,1614.23,0.0 +256,16384,4096,512,0,0,85.3201,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,805.43,1696.01,0.0 +256,20480,4096,512,18,0,144.807,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,593.2,1245.49,0.0 +256,32768,4096,512,0,0,159.2565,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,863.0,1804.07,0.0 +256,1,7168,2048,8,0,8.5724,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.42,1714.39,0.0 +256,2,7168,2048,8,0,8.6049,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.82,1709.82,0.0 +256,4,7168,2048,8,0,8.188,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.34,1800.88,0.0 +256,8,7168,2048,8,0,8.3523,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.12,1773.3,0.0 +256,16,7168,2048,8,0,8.3043,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.57,1799.33,0.0 +256,32,7168,2048,7,0,8.4862,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,110.71,1791.66,0.0 +256,64,7168,2048,8,0,8.825,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,212.92,1782.28,0.0 +256,128,7168,2048,18,0,9.0903,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,413.42,1845.62,0.0 +256,256,7168,2048,18,0,13.0936,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,574.04,1441.5,0.0 +256,512,7168,2048,0,0,19.376,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,775.83,1190.58,0.0 +256,1024,7168,2048,0,0,31.2424,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,962.31,1006.88,0.0 +256,2048,7168,2048,0,0,56.3503,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1067.07,855.98,0.0 +256,4096,7168,2048,0,0,98.6453,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1219.11,829.12,0.0 +256,8192,7168,2048,0,0,178.4711,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1347.66,834.3,0.0 +256,16384,7168,2048,0,0,330.8182,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1454.08,855.8,0.0 +256,20480,7168,2048,0,0,630.5854,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,953.55,555.4,0.0 +256,32768,7168,2048,0,0,649.7976,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1480.57,848.8,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_ds_v3.csv b/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_ds_v3.csv new file mode 100644 index 0000000000..7394571299 --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_ds_v3.csv @@ -0,0 +1,68 @@ +M,N,K +1, 2112, 7168 +2, 2112, 7168 +4, 2112, 7168 +8, 2112, 7168 +16, 2112, 7168 +32, 2112, 7168 +64, 2112, 7168 +128, 2112, 7168 +256, 2112, 7168 +512, 2112, 7168 +1024, 2112, 7168 +2048, 2112, 7168 +4096, 2112, 7168 +8192, 2112, 7168 +16384, 2112, 7168 +32768, 2112, 7168 +1, 3072, 1536 +2, 3072, 1536 +4, 3072, 1536 +8, 3072, 1536 +16, 3072, 1536 +32, 3072, 1536 +64, 3072, 1536 +128, 3072, 1536 +256, 3072, 1536 +512, 3072, 1536 +1024, 3072, 1536 +2048, 3072, 1536 +4096, 3072, 1536 +8192, 3072, 1536 +16384, 3072, 1536 +20480, 3072, 1536 +32768, 3072, 1536 +1, 4096, 512 +2, 4096, 512 +4, 4096, 512 +8, 4096, 512 +16, 4096, 512 +32, 4096, 512 +64, 4096, 512 +128, 4096, 512 +256, 4096, 512 +512, 4096, 512 +1024, 4096, 512 +2048, 4096, 512 +4096, 4096, 512 +8192, 4096, 512 +16384, 4096, 512 +20480, 4096, 512 +32768, 4096, 512 +1, 7168, 2048 +2, 7168, 2048 +4, 7168, 2048 +8, 7168, 2048 +16, 7168, 2048 +32, 7168, 2048 +64, 7168, 2048 +128, 7168, 2048 +256, 7168, 2048 +512, 7168, 2048 +1024, 7168, 2048 +2048, 7168, 2048 +4096, 7168, 2048 +8192, 7168, 2048 +16384, 7168, 2048 +20480, 7168, 2048 +32768, 7168, 2048 \ No newline at end of file