From e6337771d2e3355022bff0a4fd150c3145c35003 Mon Sep 17 00:00:00 2001 From: solin Date: Mon, 3 Nov 2025 05:47:28 +0000 Subject: [PATCH 01/13] cktile bpreshuffle && tuning code --- .../a8w8_bpreshuffle_cktile_tuned_gemm.csv | 244 +++++++++++ .../a8w8_bpreshuffle_cktile_untuned_gemm.csv | 244 +++++++++++ aiter/jit/core.py | 9 + aiter/jit/optCompilerConfig.json | 37 ++ aiter/ops/gemm_op_a8w8.py | 67 +++ csrc/cktile_gemm_a8w8_bpreshuffle/README.md | 18 + .../gemm_a8w8_bpreshuffle_cktile.cu | 193 +++++++++ .../gemm_a8w8_bpreshuffle_cktile_common.py | 372 +++++++++++++++++ .../gemm_a8w8_bpreshuffle_cktile_tune.cu | 102 +++++ .../gemm_a8w8_bpreshuffle_cktile_tune.py | 169 ++++++++ .../gen_instances.py | 281 +++++++++++++ .../include/gemm_a8w8_bpreshuffle_cktile.h | 20 + .../gemm_a8w8_bpreshuffle_cktile_common.cuh | 390 ++++++++++++++++++ .../gemm_a8w8_bpreshuffle_cktile_pybind.cu | 15 + ...emm_a8w8_bpreshuffle_cktile_tune_pybind.cu | 17 + csrc/rocm_ops.cpp | 3 +- op_tests/testflatmm.py | 165 ++++++++ 17 files changed, 2345 insertions(+), 1 deletion(-) create mode 100644 aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv create mode 100755 aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv create mode 100644 csrc/cktile_gemm_a8w8_bpreshuffle/README.md create mode 100755 csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu create mode 100644 csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py create mode 100644 csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu create mode 100755 csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py create mode 100755 csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py create mode 100644 csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile.h create mode 100644 csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh create mode 100644 csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu create mode 100644 
csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu create mode 100755 op_tests/testflatmm.py diff --git a/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv b/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv new file mode 100644 index 0000000000..3ff676d588 --- /dev/null +++ b/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv @@ -0,0 +1,244 @@ +cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +80,1,9216,4096,30,0,12.9048,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,5.85,2926.92,0.0 +80,2,9216,4096,2,0,12.9388,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,11.67,2920.97,0.0 +80,4,9216,4096,2,0,12.9816,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,23.26,2914.81,0.0 +80,8,9216,4096,30,0,13.1572,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,45.9,2882.75,0.0 +80,16,9216,4096,2,0,14.0139,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,86.2,2719.38,0.0 +80,32,9216,4096,9,0,16.7096,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,144.58,2302.25,0.0 +80,64,9216,4096,22,0,24.5395,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,196.9,1597.04,0.0 +80,128,9216,4096,24,0,35.4998,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,272.22,1144.58,0.0 +80,256,9216,4096,68,0,63.803,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,302.92,682.04,0.0 +80,1024,9216,4096,54,0,211.9977,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,364.67,286.88,0.0 +80,2048,9216,4096,54,0,410.3007,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,376.84,204.45,0.0 +80,4096,9216,4096,54,0,786.7339,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,393.07,165.27,0.0 
+80,4240,9216,4096,54,0,829.5777,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,385.87,160.64,0.0 +80,16384,9216,4096,54,0,3084.9868,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,400.96,131.88,0.0 +80,32768,9216,4096,54,0,6135.9085,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,403.18,126.46,0.0 +80,1,4608,4096,30,0,9.9687,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,3.79,1894.7,0.0 +80,2,4608,4096,2,0,9.8532,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,7.66,1918.26,0.0 +80,4,4608,4096,2,0,10.3186,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,14.63,1834.32,0.0 +80,8,4608,4096,30,0,10.2518,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,29.46,1851.47,0.0 +80,16,4608,4096,9,0,10.1797,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,59.33,1875.04,0.0 +80,32,4608,4096,37,0,11.0049,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,109.77,1753.8,0.0 +80,64,4608,4096,23,0,15.8202,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,152.71,1246.91,0.0 +80,128,4608,4096,24,0,22.8823,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,211.16,899.31,0.0 +80,256,4608,4096,24,0,35.1223,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,275.14,634.42,0.0 +80,1024,4608,4096,26,0,111.4377,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,346.87,291.7,0.0 +80,2048,4608,4096,54,0,210.7297,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,366.87,218.94,0.0 +80,4096,4608,4096,54,0,408.7369,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,378.28,179.58,0.0 
+80,16384,4608,4096,54,0,1545.3929,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,400.21,153.34,0.0 +80,32768,4608,4096,54,0,3099.2964,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,399.11,146.83,0.0 +80,1,1280,8192,2,0,12.7965,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,1.64,820.26,0.0 +80,32,1280,8192,2,0,13.6423,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,49.19,793.84,0.0 +80,64,1280,8192,2,0,13.4425,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,99.85,831.24,0.0 +80,128,1280,8192,9,0,16.5571,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,162.13,716.43,0.0 +80,192,1280,8192,23,0,24.9874,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,161.14,502.26,0.0 +80,256,1280,8192,23,0,25.3858,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,211.48,521.48,0.0 +80,320,1280,8192,23,0,36.0155,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,186.33,386.68,0.0 +80,512,1280,8192,24,0,39.0062,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,275.27,409.95,0.0 +80,1024,1280,8192,24,0,62.422,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,344.03,344.36,0.0 +80,2048,1280,8192,68,0,115.5764,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,371.61,281.25,0.0 +80,4096,1280,8192,68,0,224.3581,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,382.87,243.03,0.0 +80,8192,1280,8192,48,0,434.5678,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,395.33,226.81,0.0 +80,16384,1280,8192,48,0,855.908,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,401.44,218.07,0.0 
+80,1,8192,1024,33,0,6.6064,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x64_default,2.54,1272.4,0.0 +80,32,8192,1024,37,0,6.9748,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,76.97,1282.57,0.0 +80,64,8192,1024,23,0,10.4742,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,102.51,907.25,0.0 +80,128,8192,1024,24,0,14.6067,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,147.02,726.85,0.0 +80,192,8192,1024,51,0,18.4464,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,174.63,635.95,0.0 +80,256,8192,1024,46,0,22.7833,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,188.51,563.79,0.0 +80,320,8192,1024,48,0,24.6382,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,217.9,566.57,0.0 +80,512,8192,1024,46,0,37.6643,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,228.07,459.36,0.0 +80,1024,8192,1024,46,0,65.3665,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,262.82,401.04,0.0 +80,2048,8192,1024,41,0,117.7909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,291.7,373.88,0.0 +80,4096,8192,1024,41,0,222.3966,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,309.0,358.33,0.0 +80,8192,8192,1024,41,0,423.1597,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,324.79,356.83,0.0 +80,16384,8192,1024,41,0,821.258,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,334.7,357.5,0.0 +80,16,1536,7168,2,0,11.9166,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,29.57,937.67,0.0 +80,32,1536,7168,2,0,12.2456,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,57.54,925.86,0.0 
+80,64,1536,7168,9,0,14.8021,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,95.21,788.09,0.0 +80,128,1536,7168,21,0,20.5898,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,136.89,598.39,0.0 +80,256,1536,7168,66,0,29.7058,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x192x256_1x4x1_16x16x64_default,189.77,458.88,0.0 +80,512,1536,7168,58,0,44.6328,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,252.6,364.15,0.0 +80,1024,1536,7168,58,0,77.1886,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,292.12,278.48,0.0 +80,1536,1536,7168,20,0,101.7773,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x256x128_1x4x1_16x16x64_default,332.32,262.72,0.0 +80,2048,1536,7168,24,0,130.8105,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,344.75,244.49,0.0 +80,4096,1536,7168,48,0,242.9667,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,371.22,217.94,0.0 +80,8192,1536,7168,54,0,461.2919,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,391.05,205.72,0.0 +80,16384,1536,7168,54,0,905.9909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,398.21,197.33,0.0 +80,20480,1536,7168,48,0,1118.0539,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,403.35,197.42,0.0 +80,16,3072,1536,2,0,5.9618,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,25.33,812.08,0.0 +80,32,3072,1536,9,0,6.7498,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,44.74,735.48,0.0 +80,64,3072,1536,21,0,8.9754,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,67.29,580.49,0.0 +80,128,3072,1536,21,0,11.1207,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,108.62,512.7,0.0 
+80,256,3072,1536,57,0,15.8689,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x192x256_1x4x1_16x16x64_default,152.24,421.24,0.0 +80,512,3072,1536,23,0,23.6895,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,203.97,365.17,0.0 +80,1024,3072,1536,46,0,39.0277,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,247.61,322.41,0.0 +80,1536,3072,1536,54,0,50.9012,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,284.78,324.45,0.0 +80,2048,3072,1536,48,0,65.8281,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,293.6,310.62,0.0 +80,4096,3072,1536,54,0,117.4866,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,329.01,307.91,0.0 +80,8192,3072,1536,54,0,225.0856,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,343.47,300.48,0.0 +80,16384,3072,1536,54,0,438.9518,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,352.25,297.41,0.0 +80,20480,3072,1536,54,0,542.6733,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,356.15,298.53,0.0 +80,16,576,7168,2,0,12.0528,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,10.96,353.6,0.0 +80,32,576,7168,30,0,12.3251,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,21.44,356.59,0.0 +80,64,576,7168,2,0,12.2075,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,43.29,381.83,0.0 +80,128,576,7168,2,0,12.3514,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,85.57,420.5,0.0 +80,256,576,7168,9,0,14.9376,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,141.52,418.99,0.0 +80,512,576,7168,9,0,22.5703,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,187.32,371.67,0.0 
+80,1024,576,7168,22,0,35.4728,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,238.37,356.57,0.0 +80,1536,576,7168,58,0,44.4162,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,285.56,380.68,0.0 +80,2048,576,7168,131,0,65.7335,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x64_default,257.27,322.03,0.0 +80,4096,576,7168,74,0,107.8111,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_160x192x128_1x4x1_16x16x64_default,313.72,354.39,0.0 +80,8192,576,7168,58,0,186.7817,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,362.16,387.01,0.0 +80,16384,576,7168,54,0,359.7223,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,376.1,390.42,0.0 +80,20480,576,7168,54,0,436.2202,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,387.68,400.08,0.0 +80,16,7168,2048,2,0,9.1742,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,51.2,1628.72,0.0 +80,32,7168,2048,49,0,10.7453,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,87.44,1414.98,0.0 +80,64,7168,2048,49,0,13.6786,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,137.37,1149.87,0.0 +80,128,7168,2048,23,0,19.541,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,192.32,858.56,0.0 +80,256,7168,2048,18,0,30.4226,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,247.06,620.41,0.0 +80,512,7168,2048,48,0,52.0579,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,288.76,443.13,0.0 +80,1024,7168,2048,48,0,96.1383,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,312.72,327.21,0.0 +80,1536,7168,2048,46,0,135.3627,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,333.16,294.36,0.0 
+80,2048,7168,2048,48,0,180.4336,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,333.25,267.33,0.0 +80,4096,7168,2048,41,0,338.9949,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,354.75,241.27,0.0 +80,8192,7168,2048,41,0,651.1562,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,369.37,228.67,0.0 +80,16384,7168,2048,41,0,1282.8862,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,374.96,220.69,0.0 +80,20480,7168,2048,41,0,1602.6372,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,375.19,218.53,0.0 +80,16,4608,7168,9,0,14.6651,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,72.07,2270.17,0.0 +80,32,4608,7168,9,0,14.9106,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,141.77,2250.37,0.0 +80,64,4608,7168,23,0,23.5169,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,179.78,1449.12,0.0 +80,128,4608,7168,24,0,35.1222,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,240.75,1000.15,0.0 +80,256,4608,7168,24,0,55.8852,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,302.61,666.09,0.0 +80,512,4608,7168,0,0,103.0634,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default,328.18,401.88,0.0 +80,1024,4608,7168,26,0,184.555,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,366.53,269.88,0.0 +80,1536,4608,7168,18,0,272.9456,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,371.75,213.21,0.0 +80,2048,4608,7168,54,0,354.6475,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,381.48,187.75,0.0 +80,4096,4608,7168,54,0,692.6798,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,390.63,144.57,0.0 
+80,8192,4608,7168,48,0,1343.0841,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,402.93,124.53,0.0 +80,16384,4608,7168,54,0,2639.293,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,410.08,114.22,0.0 +80,20480,4608,7168,54,0,3276.6577,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,412.89,112.48,0.0 +80,16,7168,2304,21,0,11.0122,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,47.99,1523.88,0.0 +80,32,7168,2304,49,0,11.1539,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,94.76,1528.39,0.0 +80,64,7168,2304,49,0,14.327,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,147.55,1227.06,0.0 +80,128,7168,2304,23,0,21.2273,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,199.17,878.35,0.0 +80,256,7168,2304,46,0,31.7244,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,266.54,654.86,0.0 +80,512,7168,2304,48,0,56.153,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,301.17,445.83,0.0 +80,1024,7168,2304,48,0,103.9455,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,325.39,322.81,0.0 +80,1536,7168,2304,46,0,144.8029,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,350.37,290.56,0.0 +80,2048,7168,2304,46,0,193.0686,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,350.37,262.05,0.0 +80,4096,7168,2304,41,0,366.5239,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,369.12,231.01,0.0 +80,8192,7168,2304,41,0,709.1492,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,381.56,215.51,0.0 +80,16384,7168,2304,41,0,1404.7256,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,385.25,205.84,0.0 
+80,20480,7168,2304,41,0,1741.0107,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,388.54,205.23,0.0 +80,16,512,7168,30,0,11.9822,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,9.8,317.23,0.0 +80,32,512,7168,2,0,12.2803,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,19.13,320.2,0.0 +80,64,512,7168,30,0,12.315,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,38.15,340.58,0.0 +80,128,512,7168,2,0,12.3314,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,76.19,382.65,0.0 +80,256,512,7168,9,0,14.8513,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,126.52,388.33,0.0 +80,512,512,7168,37,0,22.2836,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,168.65,352.92,0.0 +80,1024,512,7168,24,0,34.5858,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,217.32,348.66,0.0 +80,1536,512,7168,99,0,47.427,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x128x256_1x4x1_16x16x64_default,237.72,342.69,0.0 +80,2048,512,7168,24,0,54.6059,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,275.29,374.45,0.0 +80,4096,512,7168,18,0,101.3206,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,296.73,367.39,0.0 +80,8192,512,7168,24,0,180.3432,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,333.42,392.47,0.0 +80,16384,512,7168,24,0,332.6272,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,361.54,414.54,0.0 +80,20480,512,7168,48,0,387.5191,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,387.91,442.41,0.0 +80,16,4096,512,30,0,3.9474,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,17.0,566.55,0.0 
+80,32,4096,512,49,0,4.2976,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,31.23,552.79,0.0 +80,64,4096,512,49,0,5.4494,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,49.26,487.06,0.0 +80,128,4096,512,49,0,6.8127,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,78.8,471.36,0.0 +80,256,4096,512,15,0,10.6842,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x128_1x4x1_16x16x64_default,100.5,404.84,0.0 +80,512,4096,512,45,0,15.5946,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,137.71,420.25,0.0 +80,1024,4096,512,46,0,25.069,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,171.33,439.19,0.0 +80,1536,4096,512,46,0,33.2391,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,193.82,465.31,0.0 +80,2048,4096,512,46,0,42.7751,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,200.82,465.76,0.0 +80,4096,4096,512,41,0,75.405,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,227.83,500.61,0.0 +80,8192,4096,512,41,0,137.5652,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,249.77,533.57,0.0 +80,16384,4096,512,41,0,262.5162,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,261.77,551.22,0.0 +80,20480,4096,512,41,0,322.8676,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,266.05,558.6,0.0 +80,16,7168,256,33,0,4.1533,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x64_default,14.14,498.03,0.0 +80,32,7168,256,15,0,4.3338,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x128_1x4x1_16x16x64_default,27.1,531.16,0.0 +80,64,7168,256,16,0,5.6176,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x128_1x4x1_16x16x64_default,41.81,492.9,0.0 +80,128,7168,256,45,0,7.0727,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,66.42,523.53,0.0 
+80,256,7168,256,18,0,11.1402,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,84.34,500.04,0.0 +80,512,7168,256,45,0,16.3518,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,114.91,569.12,0.0 +80,1024,7168,256,45,0,27.5099,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,136.61,609.86,0.0 +80,1536,7168,256,45,0,36.8025,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,153.17,658.88,0.0 +80,2048,7168,256,41,0,47.1463,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,159.42,672.79,0.0 +80,4096,7168,256,41,0,85.1181,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,176.61,723.75,0.0 +80,8192,7168,256,41,0,156.5563,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,192.04,775.27,0.0 +80,16384,7168,256,41,0,301.4756,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,199.45,799.1,0.0 +80,20480,7168,256,41,0,373.7383,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,201.11,804.52,0.0 +80,32,7168,1536,51,0,9.8996,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,71.18,1163.48,0.0 +80,32,7168,576,41,0,13.2274,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,19.98,348.21,0.0 +80,32,2048,7168,2,0,12.4173,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,75.66,1211.25,0.0 +80,32,7168,512,49,0,5.5426,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,42.38,747.87,0.0 +80,32,256,7168,2,0,10.8848,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,10.79,191.16,0.0 +80,64,7168,1536,21,0,11.5816,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,121.68,1038.36,0.0 
+80,64,7168,576,41,0,13.6023,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,38.85,373.7,0.0 +80,64,2048,7168,9,0,14.9748,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,125.48,1028.46,0.0 +80,64,7168,512,21,0,6.7758,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,69.33,681.88,0.0 +80,64,256,7168,2,0,11.4051,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,20.59,203.99,0.0 +80,96,7168,1536,17,0,14.9313,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x128_1x4x1_16x16x64_default,141.58,839.43,0.0 +80,96,7168,576,41,0,14.1469,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,56.04,393.04,0.0 +80,96,2048,7168,21,0,20.7002,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,136.16,761.41,0.0 +80,96,7168,512,17,0,8.226,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x128_1x4x1_16x16x64_default,85.66,619.43,0.0 +80,96,256,7168,30,0,11.4763,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,30.7,224.14,0.0 +80,128,7168,1536,51,0,16.5378,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,170.43,788.6,0.0 +80,128,7168,576,41,0,14.7401,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,71.71,409.6,0.0 +80,128,2048,7168,9,0,22.4085,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,167.71,719.45,0.0 +80,128,7168,512,23,0,10.1582,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,92.49,548.38,0.0 +80,128,256,7168,30,0,11.8782,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,39.55,237.25,0.0 +80,256,7168,1536,46,0,25.3317,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,222.53,595.04,0.0 +80,256,7168,576,71,0,17.4843,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x64_1x4x1_16x16x64_default,120.9,454.48,0.0 
+80,256,2048,7168,24,0,34.809,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,215.93,504.57,0.0 +80,256,7168,512,18,0,14.0471,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,133.77,531.86,0.0 +80,256,256,7168,2,0,12.0719,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,77.83,314.87,0.0 +80,512,7168,1536,48,0,42.1033,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,267.78,454.51,0.0 +80,512,7168,576,71,0,22.909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x64_1x4x1_16x16x64_default,184.55,513.5,0.0 +80,512,2048,7168,24,0,55.1913,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,272.37,370.48,0.0 +80,512,7168,512,48,0,22.4995,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,167.03,501.0,0.0 +80,512,256,7168,9,0,14.8883,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,126.21,387.36,0.0 +80,1024,7168,1536,48,0,77.1567,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,292.24,353.35,0.0 +80,1024,7168,576,41,0,39.3022,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,215.15,493.58,0.0 +80,1024,2048,7168,18,0,100.866,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,298.07,259.89,0.0 +80,1024,7168,512,46,0,39.1797,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,191.84,481.74,0.0 +80,1024,256,7168,9,0,22.0293,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,170.6,440.29,0.0 +80,2048,7168,1536,41,0,143.6136,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,314.02,303.01,0.0 +80,2048,7168,576,41,0,69.2915,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,244.06,500.33,0.0 
+80,2048,2048,7168,18,0,177.5591,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,338.65,212.6,0.0 +80,2048,7168,512,41,0,68.617,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,219.08,496.65,0.0 +80,2048,256,7168,24,0,35.1531,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,213.81,499.63,0.0 +80,4096,7168,1536,41,0,264.7829,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,340.63,287.11,0.0 +80,4096,7168,576,41,0,125.0143,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,270.55,521.61,0.0 +80,4096,2048,7168,18,0,322.9083,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,372.42,188.34,0.0 +80,4096,7168,512,41,0,124.4324,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,241.62,518.25,0.0 +80,4096,256,7168,24,0,55.334,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,271.67,601.66,0.0 +80,8192,7168,1536,41,0,503.6996,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,358.13,280.0,0.0 +80,8192,7168,576,41,0,236.1366,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,286.47,534.81,0.0 +80,8192,2048,7168,48,0,610.2692,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,394.12,175.26,0.0 +80,8192,7168,512,41,0,230.7146,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,260.62,543.12,0.0 +80,8192,256,7168,68,0,102.6022,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,293.02,631.07,0.0 +80,16384,7168,1536,41,0,995.5304,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,362.4,272.27,0.0 +80,16384,7168,576,41,0,461.9614,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,292.86,537.81,0.0 
+80,16384,2048,7168,48,0,1211.3868,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,397.1,164.46,0.0 +80,16384,7168,512,41,0,448.4529,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,268.16,550.65,0.0 +80,16384,256,7168,24,0,182.3289,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,329.79,700.19,0.0 +80,5112,6912,5120,54,0,899.3494,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,402.31,147.03,0.0 +80,5104,5120,8192,48,0,1053.0629,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,406.58,129.17,0.0 +80,2048,4096,5120,18,0,234.6564,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,366.06,205.55,0.0 +80,5120,5120,4096,48,0,547.1132,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,392.51,172.49,0.0 +80,5120,5120,8192,48,0,1062.5069,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,404.23,128.3,0.0 +80,32,2112,7168,2,0,12.3951,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,78.17,1250.77,0.0 +80,64,2112,7168,9,0,14.9879,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,129.29,1058.71,0.0 +80,96,2112,7168,9,0,22.2991,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,130.35,727.94,0.0 +80,128,2112,7168,9,0,22.6786,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,170.89,731.83,0.0 +80,256,2112,7168,22,0,35.8442,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,216.24,503.71,0.0 +80,512,2112,7168,131,0,65.1728,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x64_default,237.86,321.78,0.0 +80,1024,2112,7168,121,0,107.0765,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x64_default,289.55,250.33,0.0 
+80,2048,2112,7168,26,0,183.0673,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,338.72,210.14,0.0 +80,4096,2112,7168,54,0,319.8768,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,387.7,193.2,0.0 +80,8192,2112,7168,54,0,628.2407,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,394.81,172.64,0.0 +80,16384,2112,7168,54,0,1249.322,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,397.07,161.52,0.0 diff --git a/aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv b/aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv new file mode 100755 index 0000000000..e97c52db23 --- /dev/null +++ b/aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv @@ -0,0 +1,244 @@ +M,N,K +1,9216,4096 +2,9216,4096 +4,9216,4096 +8,9216,4096 +16,9216,4096 +32,9216,4096 +64,9216,4096 +128,9216,4096 +256,9216,4096 +1024,9216,4096 +2048,9216,4096 +4096,9216,4096 +4240,9216,4096 +16384,9216,4096 +32768,9216,4096 +1,4608,4096 +2,4608,4096 +4,4608,4096 +8,4608,4096 +16,4608,4096 +32,4608,4096 +64,4608,4096 +128,4608,4096 +256,4608,4096 +1024,4608,4096 +2048,4608,4096 +4096,4608,4096 +16384,4608,4096 +32768,4608,4096 +1,1280,8192 +32,1280,8192 +64,1280,8192 +128,1280,8192 +192,1280,8192 +256,1280,8192 +320,1280,8192 +512,1280,8192 +1024,1280,8192 +2048,1280,8192 +4096,1280,8192 +8192,1280,8192 +16384,1280,8192 +1,8192,1024 +32,8192,1024 +64,8192,1024 +128,8192,1024 +192,8192,1024 +256,8192,1024 +320,8192,1024 +512,8192,1024 +1024,8192,1024 +2048,8192,1024 +4096,8192,1024 +8192,8192,1024 +16384,8192,1024 +16,1536,7168 +32,1536,7168 +64,1536,7168 +128,1536,7168 +256,1536,7168 +512,1536,7168 +1024,1536,7168 +1536,1536,7168 +2048,1536,7168 +4096,1536,7168 +8192,1536,7168 +16384,1536,7168 +20480,1536,7168 +16,3072,1536 +32,3072,1536 +64,3072,1536 +128,3072,1536 +256,3072,1536 +512,3072,1536 +1024,3072,1536 +1536,3072,1536 +2048,3072,1536 +4096,3072,1536 +8192,3072,1536 +16384,3072,1536 
+20480,3072,1536 +16,576,7168 +32,576,7168 +64,576,7168 +128,576,7168 +256,576,7168 +512,576,7168 +1024,576,7168 +1536,576,7168 +2048,576,7168 +4096,576,7168 +8192,576,7168 +16384,576,7168 +20480,576,7168 +16,7168,2048 +32,7168,2048 +64,7168,2048 +128,7168,2048 +256,7168,2048 +512,7168,2048 +1024,7168,2048 +1536,7168,2048 +2048,7168,2048 +4096,7168,2048 +8192,7168,2048 +16384,7168,2048 +20480,7168,2048 +16,4608,7168 +32,4608,7168 +64,4608,7168 +128,4608,7168 +256,4608,7168 +512,4608,7168 +1024,4608,7168 +1536,4608,7168 +2048,4608,7168 +4096,4608,7168 +8192,4608,7168 +16384,4608,7168 +20480,4608,7168 +16,7168,2304 +32,7168,2304 +64,7168,2304 +128,7168,2304 +256,7168,2304 +512,7168,2304 +1024,7168,2304 +1536,7168,2304 +2048,7168,2304 +4096,7168,2304 +8192,7168,2304 +16384,7168,2304 +20480,7168,2304 +16,512,7168 +32,512,7168 +64,512,7168 +128,512,7168 +256,512,7168 +512,512,7168 +1024,512,7168 +1536,512,7168 +2048,512,7168 +4096,512,7168 +8192,512,7168 +16384,512,7168 +20480,512,7168 +16,4096,512 +32,4096,512 +64,4096,512 +128,4096,512 +256,4096,512 +512,4096,512 +1024,4096,512 +1536,4096,512 +2048,4096,512 +4096,4096,512 +8192,4096,512 +16384,4096,512 +20480,4096,512 +16,7168,256 +32,7168,256 +64,7168,256 +128,7168,256 +256,7168,256 +512,7168,256 +1024,7168,256 +1536,7168,256 +2048,7168,256 +4096,7168,256 +8192,7168,256 +16384,7168,256 +20480,7168,256 +32, 7168, 1536 +32, 7168, 576 +32, 2048, 7168 +32, 7168, 512 +32, 256, 7168 +64, 7168, 1536 +64, 7168, 576 +64, 2048, 7168 +64, 7168, 512 +64, 256, 7168 +96, 7168, 1536 +96, 7168, 576 +96, 2048, 7168 +96, 7168, 512 +96, 256, 7168 +128, 7168, 1536 +128, 7168, 576 +128, 2048, 7168 +128, 7168, 512 +128, 256, 7168 +256, 7168, 1536 +256, 7168, 576 +256, 2048, 7168 +256, 7168, 512 +256, 256, 7168 +512, 7168, 1536 +512, 7168, 576 +512, 2048, 7168 +512, 7168, 512 +512, 256, 7168 +1024, 7168, 1536 +1024, 7168, 576 +1024, 2048, 7168 +1024, 7168, 512 +1024, 256, 7168 +2048, 7168, 1536 +2048, 7168, 576 +2048, 2048, 7168 +2048, 
7168, 512 +2048, 256, 7168 +4096, 7168, 1536 +4096, 7168, 576 +4096, 2048, 7168 +4096, 7168, 512 +4096, 256, 7168 +8192, 7168, 1536 +8192, 7168, 576 +8192, 2048, 7168 +8192, 7168, 512 +8192, 256, 7168 +16384, 7168, 1536 +16384, 7168, 576 +16384, 2048, 7168 +16384, 7168, 512 +16384, 256, 7168 +5112,6912,5120 +5104,5120,8192 +2048,4096,5120 +5120,5120,4096 +5120,5120,8192 +32, 2112, 7168 +64, 2112, 7168 +96, 2112, 7168 +128, 2112, 7168 +256, 2112, 7168 +512, 2112, 7168 +1024, 2112, 7168 +2048, 2112, 7168 +4096, 2112, 7168 +8192, 2112, 7168 +16384, 2112, 7168 diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 112248cbb3..5f359c62a9 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -81,6 +81,12 @@ def mp_lock( "AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE", f"{AITER_ROOT_DIR}/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv", ) + +AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE = os.getenv( + "AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE", + f"{AITER_ROOT_DIR}/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv", +) + AITER_CONFIG_GEMM_A8W8_BLOCKSCALE = os.getenv( "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE", f"{AITER_ROOT_DIR}/aiter/configs/a8w8_blockscale_tuned_gemm.csv", @@ -148,6 +154,9 @@ def update_config_files(file_path: str, merge_name: str): AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE = update_config_files( AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE, "a8w8_bpreshuffle_tuned_gemm" ) +AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE = update_config_files( + AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE, "a8w8_bpreshuffle_cktile_tuned_gemm" +) AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE = update_config_files( AITER_CONFIG_GEMM_A8W8_BLOCKSCALE, "a8w8_blockscale_tuned_gemm" ) diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json index 42bccaa69a..1e345e2184 100644 --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -269,6 +269,25 @@ "is_standalone": "False", "blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_a8w8_bpreshuffle/gen_instances.py 
--working_path {{}} --tune_file {AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE}'" }, + "module_gemm_a8w8_bpreshuffle_cktile": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu'", + "f'{AITER_CSRC_DIR}/py_itfs_cu/gemm_common.cu'", + "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/include'", + "f'{CK_DIR}/example/ck_tile/18_flatmm'" + ], + "is_python_module": "True", + "is_standalone": "False", + "verbose": "False", + "hip_clang_path": "os.environ.get('GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH')", + "blob_gen_cmd": "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/gen_instances.py --working_path {{}} --tune_file {AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE}'" + }, "module_gemm_a8w8_asm": { "srcs": [ "f'{AITER_CSRC_DIR}/pybind/gemm_a8w8_asm_pybind.cu'", @@ -529,6 +548,24 @@ "is_standalone": "False", "blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_a8w8_bpreshuffle/gen_instances.py --working_path {{}} --tune'" }, + "module_gemm_a8w8_bpreshuffle_cktile_tune": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu'", + "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/include'", + "f'{CK_DIR}/example/ck_tile/18_flatmm'" + ], + "verbose": "False", + "hip_clang_path": "os.environ.get('GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH')", + "is_python_module": "True", + "is_standalone": "False", + "blob_gen_cmd": "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/gen_instances.py --working_path {{}} --tune'" + }, "module_aiter_operator": { "srcs": [ "f'{AITER_CSRC_DIR}/pybind/aiter_operator_pybind.cu'", diff --git a/aiter/ops/gemm_op_a8w8.py 
b/aiter/ops/gemm_op_a8w8.py index 16dc8faca7..e0ff62ccf1 100644 --- a/aiter/ops/gemm_op_a8w8.py +++ b/aiter/ops/gemm_op_a8w8.py @@ -13,6 +13,7 @@ AITER_CONFIG_GEMM_A8W8_FILE, AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE, + AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE, AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE, AITER_LOG_TUNED_CONFIG, ) @@ -75,6 +76,30 @@ def gemm_a8w8_bpreshuffle_ck( ) -> torch.Tensor: ... +def gen_gemm_a8w8_bpreshuffle_cktile_fake_tensors( + XQ: torch.Tensor, + WQ: torch.Tensor, + x_scale: torch.Tensor, + w_scale: torch.Tensor, + Out: torch.Tensor, +) -> torch.Tensor: + return Out + + +@compile_ops( + "module_gemm_a8w8_bpreshuffle_cktile", + fc_name="gemm_a8w8_bpreshuffle_cktile", + gen_fake=gen_gemm_a8w8_bpreshuffle_cktile_fake_tensors, +) +def gemm_a8w8_bpreshuffle_cktile( + XQ: Tensor, + WQ: Tensor, + x_scale: Tensor, + w_scale: Tensor, + out: Tensor, +) -> Tensor: ... + + def gen_gemm_a8w8_asm_fake_tensors( XQ: Tensor, # A:[M, K] i8 WQ: Tensor, # B:[N, K] i8 -> shuffle layout(32,16) @@ -578,3 +603,45 @@ def gemm_a8w8_blockscale_bpreshuffle_tune( kernelId: int = 0, splitK: int = 0, ) -> torch.Tensor: ... 
+ + +def gemm_a8w8_bpreshuffle_CKTILE( + XQ: Tensor, + WQ: Tensor, + x_scale: Tensor, + w_scale: Tensor, + bias: Optional[Tensor] = None, + dtype=torch.float16, + check=False, +): + assert dtype in [ + torch.bfloat16, + torch.float16, + ], f"Output {dtype=} is currently not supported in gemm_a8w8_bpreshuffle_CKTILE" + m = XQ.shape[0] + n = WQ.shape[0] + k = XQ.shape[-1] + + # get_bpreshuffle_GEMM_config( + # m, n, k, dtypes.fp8, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE + # ) + get_CKGEMM_config(m, n, k, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE) + assert WQ.dtype == dtypes.fp8, "gemm_a8w8_bpreshuffle_CKTILE only support fp8 now" + assert bias is None, "gemm_a8w8_bpreshuffle_CKTILE does not support bias now" + Y = torch.empty(m, n, dtype=dtype, device=XQ.device) + return gemm_a8w8_bpreshuffle_cktile(XQ, WQ, x_scale, w_scale, Y) + + +@compile_ops( + "module_gemm_a8w8_bpreshuffle_cktile_tune", + fc_name="gemm_a8w8_bpreshuffle_cktile_tune", +) +def gemm_a8w8_bpreshuffle_cktile_tune( + XQ: Tensor, + WQ: Tensor, + x_scale: Tensor, + w_scale: Tensor, + out: Tensor, + kernelId: int, + splitK: int = 0, +) -> Tensor: ... diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/README.md b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md new file mode 100644 index 0000000000..571632fe98 --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md @@ -0,0 +1,18 @@ +# CKTILE gemm a8w8 bpreshuffle tune + +1. Install aiter: +`python3 setup.py develop` + +2. 
Tune gemm a8w8: + First add GEMM shapes in `aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv`, then run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_cktile_tune via jit: +`GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py -i aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv -o aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` +If you want to use split K kernels, you can add the `-k` parameter at the end; note that you should change `bias` to `bias/(2^k)`. +You can find the results of the tuning in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv`. + +3. Test the performance, modify the test instance in `op_tests/testflatmm.py` and run it, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_cktile kernels in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` via jit: +`GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 op_tests/testflatmm.py` + + +## More +If you want to re-install gemm_a8w8_bpreshuffle_cktile, you should remove `aiter/jit/module_gemm_a8w8_bpreshuffle_cktile.so` and `aiter/jit/build/module_gemm_a8w8_bpreshuffle_cktile` first. +If you use flag `PREBUILD_KERNELS=1 USE_CK_A8W8=1` when you install aiter, it will build gemm a8w8 kernels in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` by default. If you want to use the new result of gemm_a8w8_bpreshuffle_cktile_tune, please remove `build` and `*.so` first, then re-install aiter after finishing the tune. diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu new file mode 100755 index 0000000000..2836921bf9 --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc.
All rights reserved. + +#include "gemm_a8w8_bpreshuffle_cktile_common.cuh" +#include "gemm_a8w8_bpreshuffle_cktile_lookup.h" +#include "gemm_a8w8_bpreshuffle_cktile_manifest.h" +#include "gemm_common.h" +#include + +using RowwiseKernel = std::function; + +// Define a custom hash function for std::tuple +struct IntTupleHash +{ + size_t operator()(const std::tuple& t) const + { + auto hash1 = std::hash{}(std::get<0>(t)); + auto hash2 = std::hash{}(std::get<1>(t)); + auto hash3 = std::hash{}(std::get<2>(t)); + return hash1 ^ hash2 ^ hash3; + } +}; + +using RowwiseKernelMap = std::unordered_map, RowwiseKernel, IntTupleHash>; + +template +RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) +{ + + return a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default< + DDataType, + EDataType>; + // if(K >= 1536) + // { + // if(M < 256 && K % 512 == 0) + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default; + // } + // else + // { + // if(N < 1536) + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x64x128_1x4x1_16x16x64_default; + // } + // else + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default; + // } + // } + // } + // else if(K >= 512) + // { + // if(M < 64) + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default; + // } + // else if(M <= 256) + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x64x128_1x4x1_16x16x64_default; + // } + // else + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x64_1x4x1_16x16x64_default; + // } + // } + // else if(K >= 192 && K % 64 == 0) + // { + // if(M <= 256) + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x256x64_1x4x1_16x16x64_default; + // } + // else + // { + // return + // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x64_1x4x1_16x16x64_default; + // } + // } + // else + 
{ + TORCH_CHECK(false, + "Unsupported K for heuristic dispatch: ", + K, + ". Supported K greater than 192 and K % 64 == 0."); + } +} + +// Helper function to return the next largest power of 2 +static constexpr int nextPow2(unsigned int num) +{ + if(num <= 1) + return 1; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +RowwiseKernel rowwise_dispatch(int M, int N, int K) +{ + // For a given shape, either find the best kernel via lookup or heuristic. + // For many small M shapes, we bucket them to the next largest kernel. + // This is fine since kernels are padded anyway. + + static const auto lookup = [] { + if constexpr(std::is_same_v) + { + return RowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType, F16)}; + } + else if constexpr(std::is_same_v) + { + return RowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType, B16)}; + } + else + { + static_assert(false, "rowwise_dispatch used with unsupported dtype!"); + } + }(); + + // First check if this shape(M,N,K) is available in the direct lookup. + auto it = lookup.find({M, N, K}); + // If we found an optimal kernel, use it. + if(it != lookup.end()) + { + return it->second; + } + + int padded_m = M; + + // Fine-grained search + padded_m = getPaddedM(M, N, K, 0); + // Second check if this shape(padded_m,N,K) is available in the direct lookup. + it = lookup.find({padded_m, N, K}); + // If we found an optimal kernel, use it. + if(it != lookup.end()) + { + return it->second; + } + + // Coarse-grained search + padded_m = getPaddedM(M, N, K, 1); + // Third check if this shape(padded_m,N,K) is available in the direct lookup. + it = lookup.find({padded_m, N, K}); + // If we found an optimal kernel, use it. + if(it != lookup.end()) + { + return it->second; + } + + // Otherwise, use heuristics. 
+ return rowwise_heuristic_dispatch(M, N, K); +} + +torch::Tensor gemm_a8w8_bpreshuffle_cktile(torch::Tensor& XQ, + torch::Tensor& WQ, + torch::Tensor& x_scale, + torch::Tensor& w_scale, + torch::Tensor& Y) +{ + TORCH_CHECK(XQ.dtype() == WQ.dtype(), "Weights and activations should have the same dtype!"); + TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), "Scales should have the same dtype!"); + + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + + if(x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::Half) + { + rowwise_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y); + } + else if(x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::BFloat16) + { + rowwise_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y); + } + else + { + TORCH_CHECK(false, "Unsupported scales/output dtype!"); + } + return Y; +} diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py new file mode 100644 index 0000000000..5ce04751f8 --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py @@ -0,0 +1,372 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+from dataclasses import dataclass +from aiter.jit.utils.chip_info import get_gfx + + +@dataclass +class kernelInstance: + sTransposeC: bool + sUseStructuredSparsity: bool + sTileParitionerGroupNum: int + sTileParitionerM01: int + sNumWaveGroups: int + sDoubleSmemBuffer: bool + PadM: bool + PadN: bool + PadK: bool + BlockPerCu: int + MTile: int + NTile: int + KTile: int + MWarp: int + NWarp: int + KWarp: int + MWTile: int + NWTile: int + KWTile: int + sScheduler: str + + @property + def name(self) -> str: + return ("_").join( + [ + "a8w8_bpreshuffle_cktile", + ("x").join( + map( + lambda x: str(x), + [ + self.sTransposeC, + self.sUseStructuredSparsity, + self.sTileParitionerGroupNum, + self.sTileParitionerM01, + self.sNumWaveGroups, + self.sDoubleSmemBuffer, + self.PadM, + self.PadN, + self.PadK, + self.BlockPerCu, + ], + ) + ), + ("x").join(map(lambda x: str(x), [self.MTile, self.NTile, self.KTile])), + ("x").join(map(lambda x: str(x), [self.MWarp, self.NWarp, self.KWarp])), + ("x").join( + map(lambda x: str(x), [self.MWTile, self.NWTile, self.KWTile]) + ), + self.sScheduler.lower(), + ] + ) + + +# fmt: off +# kernels_list_str = ''' +kernels_list_942 = { + 0: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 1: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 2: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 64, 512, 1, 4, 1, 16, 16, 64, "Default"), + 3: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 128, 512, 1, 4, 1, 16, 16, 64, "Default"), + 4: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 512, 1, 4, 1, 16, 16, 64, "Default"), + 5: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 6: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 7: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 8: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 
0, 0, 1, 16, 512, 256, 1, 4, 1, 16, 16, 64, "Default"), + 9: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 512, 1, 4, 1, 16, 16, 64, "Default"), + 10: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 64, 1, 4, 1, 16, 16, 64, "Default"), + 11: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 12: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 13: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 128, 64, 1, 4, 1, 16, 16, 64, "Default"), + 14: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), + 15: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 16: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 17: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 18: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 19: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), + 20: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), + 21: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 22: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 23: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 24: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 25: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 26: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 27: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 28: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 128, 128, 
1, 4, 1, 16, 16, 64, "Default"), + 29: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 30: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 64, 512, 1, 4, 1, 16, 16, 64, "Default"), + 31: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 128, 512, 1, 4, 1, 16, 16, 64, "Default"), + 32: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 512, 1, 4, 1, 16, 16, 64, "Default"), + 33: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 34: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 35: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 36: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 512, 256, 1, 4, 1, 16, 16, 64, "Default"), + 37: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 64, 512, 1, 4, 1, 16, 16, 64, "Default"), + 38: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 256, 64, 1, 4, 1, 16, 16, 64, "Default"), + 39: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 40: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 41: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 128, 64, 1, 4, 1, 16, 16, 64, "Default"), + 42: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), + 43: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 44: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 45: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 46: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 47: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), + 48: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 256, 128, 1, 4, 1, 16, 16, 64, 
"Default"), + 49: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 50: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 51: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 52: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 53: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 54: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 55: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 56: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 57: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 58: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 59: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 60: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 61: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 512, 256, 1, 4, 1, 16, 16, 64, "Default"), + 62: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 64, 1, 4, 1, 16, 16, 64, "Default"), + 63: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 512, 1, 4, 1, 16, 16, 64, "Default"), + 64: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 65: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 66: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 67: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 68: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 69: 
kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 70: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 512, 256, 1, 4, 1, 16, 16, 64, "Default"), + 71: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 256, 64, 1, 4, 1, 16, 16, 64, "Default"), + 72: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 512, 1, 4, 1, 16, 16, 64, "Default"), + 73: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 74: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 160, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 75: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 76: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 77: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 78: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 79: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 80: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 81: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 82: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), + 83: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 84: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 85: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 86: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 87: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 88: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 160, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 89: kernelInstance( 0, 0, 
8, 4, 1, 0, 0, 0, 0, 1, 224, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 90: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 91: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 92: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 93: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 224, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 94: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 95: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 96: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 192, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 97: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 224, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 98: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 99: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 100: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 101: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 102: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 192, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 103: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 104: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 105: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 106: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 107: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 108: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 109: kernelInstance( 0, 0, 
8, 4, 1, 0, 0, 0, 0, 2, 48, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 110: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 111: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 112: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 113: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 114: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 115: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), + 116: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 117: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 118: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 119: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 120: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 64, 256, 1, 4, 1, 16, 16, 64, "Default"), + 121: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 160, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 122: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 123: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 192, 128, 1, 4, 1, 16, 16, 64, "Default"), + 124: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 125: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 126: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 127: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 128: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 129: kernelInstance( 0, 
0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 130: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 131: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 132: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 133: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 134: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 135: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 136: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 137: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 138: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + +} +# ''' + +default_kernels_dict_942 = { + (-1): kernelInstance(0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + (-2):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 64, 512, 1, 4, 1, 16, 16, 64, "Default"), + (-3):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 512, 1, 4, 1, 16, 16, 64, "Default"), + (-4):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 64, 1, 4, 1, 16, 16, 64, "Default"), + (-5):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 128, 64, 1, 4, 1, 16, 16, 64, "Default"), + (-6):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + (-7):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 128, 1, 4, 1, 16, 16, 64, "Default"), +} + +kernels_list_950 = { + 0: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 1: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 2: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 
16, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + 3: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 128, 512, 1, 4, 1, 16, 16, 128, "Default"), + 4: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 512, 1, 4, 1, 16, 16, 128, "Default"), + 5: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 6: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 7: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 8: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 512, 256, 1, 4, 1, 16, 16, 128, "Default"), + 9: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + 10: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 11: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 12: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 13: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 14: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 15: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 16: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 17: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 18: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 19: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 20: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 21: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 22: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 64, 
256, 1, 4, 1, 16, 16, 128, "Default"), + 23: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 24: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 25: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 26: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 27: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 28: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 29: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 30: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + 31: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 128, 512, 1, 4, 1, 16, 16, 128, "Default"), + 32: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 512, 1, 4, 1, 16, 16, 128, "Default"), + 33: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 34: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 35: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 36: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 512, 256, 1, 4, 1, 16, 16, 128, "Default"), + 37: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + 38: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 39: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 40: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 41: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 42: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 
256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 43: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 44: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 45: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 46: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 47: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 48: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 49: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 50: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 51: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 52: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 53: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 54: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 55: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 56: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 57: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 58: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 59: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 60: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 61: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 512, 256, 1, 4, 1, 16, 16, 128, "Default"), + 62: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 
192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 63: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 512, 1, 4, 1, 16, 16, 128, "Default"), + 64: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 65: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 66: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 67: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 68: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 69: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 70: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 512, 256, 1, 4, 1, 16, 16, 128, "Default"), + 71: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 72: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 512, 1, 4, 1, 16, 16, 128, "Default"), + 73: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 74: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 160, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 75: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 76: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 77: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 78: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 79: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 80: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 81: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 82: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 
256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 83: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 84: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 85: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 86: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 87: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 88: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 160, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 89: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 224, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 90: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 91: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 48, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 92: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 93: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 224, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 94: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 95: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 96: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 192, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 97: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 224, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 98: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 99: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 100: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 101: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 102: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 
0, 1, 192, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 103: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 80, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 104: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 105: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 112, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 106: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 96, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 107: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 108: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 109: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 110: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 111: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 112: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 113: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 114: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 115: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 116: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 117: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 118: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 119: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 120: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), + 121: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 160, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 122: kernelInstance( 0, 0, 
8, 4, 1, 0, 0, 0, 0, 2, 224, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 123: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), + 124: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 48, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + 125: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 126: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 127: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 128: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 129: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 130: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 131: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 132: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + + +} + +default_kernels_dict_950 = { + (-1): kernelInstance(0, 0, 8, 4, 1, 0, 0, 0, 0,1, 256, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + # (-2):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 16, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + # (-3):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 32, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + # (-4):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 64, 256, 64, 1, 4, 1, 16, 16, 128, "Default"), + # (-5):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 128, 64, 1, 4, 1, 16, 16, 128, "Default"), +} + +# fmt: on + +arch = get_gfx() +if arch == "gfx942": + kernels_list = kernels_list_942 + default_kernels_dict = default_kernels_dict_942 +else: + kernels_list = kernels_list_950 + default_kernels_dict = default_kernels_dict_950 diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu 
b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu new file mode 100644 index 0000000000..cd52d0df56 --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_a8w8_bpreshuffle_cktile_common.cuh" +#include "gemm_a8w8_bpreshuffle_cktile_lookup.h" +#include "gemm_a8w8_bpreshuffle_cktile_manifest.h" +#include "py_itfs_common.h" +#include + +using RowwiseKernel = std::function; + +// For certain high priority shapes, we directly use the best kernel rather +// than use heuristics. +using RowwiseKernelMap = std::unordered_map; + +// Helper function to return the next largest power of 2 +static constexpr int nextPow2(unsigned int num) +{ + if(num <= 1) + return 1; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +RowwiseKernel rowwise_dispatch(int id) +{ + // For a given shape, either find the best kernel via lookup or heuristic. + // For many small M shapes, we bucket them to the next largest kernel. + // This is fine since kernels are padded anyway. + + // First check if this shape is available in the direct lookup. 
+ static const auto lookup = [] { + if constexpr(std::is_same_v) + { + return RowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType, F16)}; + } + else if constexpr(std::is_same_v) + { + return RowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType, B16)}; + } + else + { + static_assert(false, "rowwise_dispatch used with unsupported dtype!"); + } + }(); + + // DEBUG: Print lookup table size + static bool debug_printed = false; + if(!debug_printed) + { + std::cout << "[solinDEBUG] Lookup table size: " << lookup.size() << std::endl; + std::cout << "[solinDEBUG] Available kernel IDs: "; + for(const auto& kv : lookup) + { + std::cout << kv.first << " "; + } + std::cout << std::endl; + debug_printed = true; + } + + TORCH_CHECK(id < lookup.size(), + "Kernel id " + std::to_string(id) + + " is out of range! (lookup.size()=" + std::to_string(lookup.size()) + ")"); + auto it = lookup.find(id); + // If we found an optimal kernel, use it. + if(it != lookup.end()) + { + return it->second; + } + // Otherwise, use heuristics. 
+ return lookup.find(0)->second; +} + +torch::Tensor gemm_a8w8_bpreshuffle_cktile_tune(torch::Tensor& XQ, + torch::Tensor& WQ, + torch::Tensor& x_scale, + torch::Tensor& w_scale, + torch::Tensor& Y, + int kernelId, + int splitK) +{ + TORCH_CHECK(XQ.dtype() == torch_fp8 && XQ.dtype() == WQ.dtype(), + "Weights and activations should both be fp8!"); + TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), "Scales should have the same dtype!"); + std::optional bias = std::nullopt; + + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + int KBatch = std::pow(2, splitK); + + if(Y.dtype() == at::ScalarType::Half) + { + rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y); + } + else + { + TORCH_CHECK(false, "Unsupported scales/output dtype!"); + } + return Y; +} diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py new file mode 100755 index 0000000000..2c0781dbcc --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+import os +import aiter +import pandas as pd +import torch +import torch.nn.functional as F +from aiter import dtypes +from aiter.jit.core import AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE +from aiter.utility.base_tuner import GemmCommonTuner +from aiter.ops.shuffle import shuffle_weight +from gemm_a8w8_bpreshuffle_cktile_common import kernels_list +import argparse +from aiter.utility.mp_tuner import mp_tuner + + +def run_torch(x, weight, x_scale, w_scale, bias=None, dtype=torch.float16): + x = x.to(dtypes.fp32) * x_scale + weight = weight.to(dtypes.fp32) * w_scale + out = F.linear(x, weight) + if bias is not None: + out = out.to(bias) + bias + return out.to(dtype) + + +def run_gemm_a8w8_bpreshuffle_cktile( + x, weight, x_scale, w_scale, out, kernel_id, splitK=0 +): + aiter.gemm_a8w8_bpreshuffle_cktile_tune( + x, weight, x_scale, w_scale, out, kernel_id, splitK + ) + return out + + +def generate_data(m, n, k, seed, dtype=dtypes.fp16, device="cuda"): + torch.manual_seed(seed) + x = torch.randn((m, k), dtype=dtype, device=device) + weight = torch.randn((n, k), dtype=dtype, device=device) + x, x_scale = aiter.pertoken_quant(x, quant_dtype=dtypes.fp8) + weight, w_scale = aiter.pertoken_quant(weight, quant_dtype=dtypes.fp8) + bias_f32 = None + weight_shuffle = shuffle_weight(weight, layout=(16, 16)) + out = torch.empty(m, n, dtype=dtype, device=device) + return x, weight_shuffle, x_scale, w_scale, out, weight, bias_f32 + + +class GemmA8W8BpreShuffleCktileTuner(GemmCommonTuner): + ARG_DEFAULTS = { + **GemmCommonTuner.ARG_DEFAULTS, + "tune_file": f"{AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE}", + "untune_file": "aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv", + } + + def _setup_specific_arguments(self): + pass + + def calculate(self, results, bpes=(1, 1, 2)): + ## bpes = (inbpe, w_bpe, outbpe) + return super().calculate(results, bpes=bpes) + + def getKernelName(self, kernelId): + if kernelId < 0 or kernelId > len(kernels_list): + return None + return 
kernels_list[kernelId].name + + def get_cktile_gemm_a8w8_bpreshuffle_tune_task( + self, + info_keys, + useSplitK, + seed, + ): + (cu_num, M, N, K) = info_keys + kernels_num = len(kernels_list) + gemm_a8w8_idx = [0, 1, 2, 3, 4] # input index in generate_data + ref_data_idx = [0, 5, 2, 3, 6] + tasks_ck = [] + for i in range(kernels_num): + kernel = kernels_list[i] + maxsplitK = ( + aiter.compute_gemm_SplitK( + M, + N, + K, + kernel.MPerBLOCK, + kernel.NPerBLOCK, + kernel.KPerBLOCK, + ) + if useSplitK + else 0 + ) + for splitK in range(maxsplitK + 1): + info = (info_keys, i, splitK, "") + tasks_ck.append( + ( + info, + generate_data, + (M, N, K, seed, dtypes.fp16), + run_gemm_a8w8_bpreshuffle_cktile, + ( + gemm_a8w8_idx, + i, + splitK, + ), + {}, + run_torch, + ( + ref_data_idx, + dtypes.fp16, + ), + {}, + None, + 1e-2, + 0.01, + ) + ) + return tasks_ck + + def tune( + self, + untunedf, + tunedf, + args, + ): + issorted = args.sort + useSplitK = args.splitK + mp_num = args.mp + shape_grouped = False + errRatio = args.errRatio + cu_num = self.get_cu_num() + task = [] + tasks_data = [] # [(kernel_nums, datas)] + seed = 10000 + for i in range(len(untunedf)): + M = untunedf.loc[i, "M"] + N = untunedf.loc[i, "N"] + K = untunedf.loc[i, "K"] + seed = seed + 1 + total_kernel_nums = 0 + kernels_num = len(kernels_list) + info_keys = (cu_num, M, N, K) + task.extend( + self.get_cktile_gemm_a8w8_bpreshuffle_tune_task( + info_keys, + useSplitK, + seed, + ) + ) + + total_kernel_nums = len(task) + + tasks_data.append((total_kernel_nums, ())) + ret = [] + if task: + ret = mp_tuner(task, tasks_data, mp_num, False, shape_grouped, errRatio) + + return ret + + +if __name__ == "__main__": + ## use default key and resultList + key = ["cu_num", "M", "N", "K"] + tuner = GemmA8W8BpreShuffleCktileTuner( + "GemmA8W8BpreShuffleCktileTuner", + key=key, + description="gen API for gemm a8w8 bpreshuffle kernel", + ) + + args = tuner.parse_args() + tuner.run(args, False) diff --git 
a/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py new file mode 100755 index 0000000000..cbdbe0af15 --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py @@ -0,0 +1,281 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +import os +import sys +from dataclasses import dataclass +import copy +from pathlib import Path +import pandas as pd +import argparse +import shutil +import torch +from gemm_a8w8_bpreshuffle_cktile_common import ( + kernelInstance, + kernels_list, + default_kernels_dict, +) + + +""" + +gemm_a8w8_bpreshuffle_cktile instance gen + +""" + + +class gemm_a8w8_bpreshuffle_cktile_codegen: + def __init__(self, working_path, istune=False): + self.working_path = working_path + self.impl_path = os.path.join(working_path, "impl") + self.instances_path = os.path.join(working_path, "instances") + self.istune = istune + + def gen_instance(self, k: kernelInstance): + INSTANCE_IMPL = f"""// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_a8w8_bpreshuffle_cktile_common.cuh" + +template +torch::Tensor +{k.name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y + ) +{{{{ + // The smallest kernel we have available. Works well for memory bound shapes. + + // Check if this input needs to be padded. 
+ int M = size_to_dim_(XQ.dim() - 1, XQ.sizes()); + int N = WQ.size(0); + int K = WQ.size(1); + bool pad = (M % {k.MTile} != 0) || (N % {k.NTile} != 0) || (K % ({k.KTile}) != 0); + if (pad) + {{{{ + // pad + {{INSTANCE_CONTENT_pad}} + // pad + }}}} + else + {{{{ + // no pad + {{INSTANCE_CONTENT_nopad}} + // no pad + }}}} +}}}} + +""" + + INSTANCE_CONTENT_nobias = f"""using FlatmmInstance = CustomConfig< + DDataType, EDataType, + {k.sTransposeC},{k.sUseStructuredSparsity}, {k.sTileParitionerGroupNum}, + {k.sTileParitionerM01}, {k.sNumWaveGroups}, {k.sDoubleSmemBuffer}, + {k.PadM}, {k.PadN}, {k.PadK}, + {k.BlockPerCu}, + {k.MTile}, {k.NTile}, {k.KTile}, + {k.MWarp}, {k.NWarp}, {k.KWarp}, + {k.MWTile}, {k.NWTile}, {k.KWTile}, + ck_tile::GemmPipelineScheduler::{k.sScheduler}>; + // Run kernel instance. + return gemm_a8w8_bpreshuffle_cktile_impl(XQ, WQ, x_scale, w_scale, Y); +""" + if self.istune: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="MNKPadding") + ), + INSTANCE_CONTENT_nopad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="Default") + ), + ) + else: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=INSTANCE_CONTENT_nobias.format( + GemmSpec="MNKPadding" + ), + INSTANCE_CONTENT_nopad=INSTANCE_CONTENT_nobias.format( + GemmSpec="Default" + ), + ) + + Path(os.path.join(self.impl_path, f"{k.name}.cuh")).write_text( + INSTANCE_IMPL_str + ) + + INSTANCE_template = """// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "impl/{name}.cuh" + +template torch::Tensor +{name}<{dtypes}>( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y + ); + +""" + INSTANCE_dFP32_eBF16 = INSTANCE_template.format(name=k.name, dtypes="F32, B16") + INSTANCE_dFP32_eFP16 = INSTANCE_template.format(name=k.name, dtypes="F32, F16") + # TODO: dFP8_eFP8 + + if self.istune: + Path( + os.path.join(self.instances_path, f"{k.name}_dFP32_eFP16.cpp") + ).write_text(INSTANCE_dFP32_eFP16) + else: + Path( + os.path.join(self.instances_path, f"{k.name}_dFP32_eBF16.cpp") + ).write_text(INSTANCE_dFP32_eBF16) + Path( + os.path.join(self.instances_path, f"{k.name}_dFP32_eFP16.cpp") + ).write_text(INSTANCE_dFP32_eFP16) + + def gen_lookup_dict(self, kernels_dict): + LOOKUP_head = """#pragma once +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifdef USE_ROCM + +#define GENERATE_LOOKUP_TABLE(DTYPE, ETYPE) \\ + { \\""" + + LOOKUP_template = """ + {{{MNK}, \\ + {kernel_name}}}, \\""" + + LOOKUP_end = """ + } + +#endif // USE_ROCM +""" + with open( + os.path.join(self.working_path, "gemm_a8w8_bpreshuffle_cktile_lookup.h"), + "w", + ) as f: + f.write(LOOKUP_head) + for mnk, k in kernels_dict.items(): + # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) + if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): + f.write( + LOOKUP_template.format( + MNK="{" + + (", ").join(map(lambda x: str(x), list(mnk))) + + "}", + kernel_name=k.name, + ) + ) + elif self.istune and isinstance(mnk, int): + f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) + f.write(LOOKUP_end) + + def gen_manifest_head(self, kernels_dict): + MAINFEST_head = """#pragma once +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifdef USE_ROCM + +#include + +#include +""" + MAINFEST_template = """ +template +torch::Tensor +{kernel_name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y); +""" + MAINFEST_end = """ + +#endif // USE_ROCM +""" + + with open( + os.path.join(self.working_path, "gemm_a8w8_bpreshuffle_cktile_manifest.h"), + "w", + ) as f: + f.write(MAINFEST_head) + for mnk, k in kernels_dict.items(): + f.write(MAINFEST_template.format(kernel_name=k.name)) + f.write(MAINFEST_end) + + def gen_instances(self, kernels_dict): + if os.path.exists(self.impl_path): + shutil.rmtree(self.impl_path) + os.mkdir(self.impl_path) + if os.path.exists(self.instances_path): + shutil.rmtree(self.instances_path) + os.mkdir(self.instances_path) + + for mnk, k in kernels_dict.items(): + self.gen_instance(k) + + self.gen_lookup_dict(kernels_dict) + self.gen_manifest_head(kernels_dict) + + +def get_tune_dict(tune_dict_csv): + tune_dict = default_kernels_dict + if os.path.exists(tune_dict_csv): + tune_df = pd.read_csv(tune_dict_csv) + if torch.cuda.is_available(): + gpu = torch.cuda.current_device() + device_properties = torch.cuda.get_device_properties(gpu) + cu_num = device_properties.multi_processor_count + tune_df = tune_df[tune_df["cu_num"] == cu_num].reset_index() + for i in range(len(tune_df)): + M = tune_df.loc[i, "M"] + N = tune_df.loc[i, "N"] + K = tune_df.loc[i, "K"] + kid = tune_df.loc[i, "kernelId"] + if kid < 0 or kid > len(kernels_list): + continue + tune_dict[(M, N, K)] = kernels_list[kid] + return tune_dict + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CKTILE gemm a8w8 kernel", + ) + + # the directory for list_blobs/gen_blobs to write files into + parser.add_argument( + "-w", + "--working_path", + default="./", + required=False, + help="the path where all the blobs are going to be generated", + ) + + parser.add_argument( + "-f", + "--tune_file", + 
default="aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv", + required=False, + help="tune_file include the result after run gemm_a8w8_bpreshuffle_cktile_tune.py", + ) + + parser.add_argument( + "--tune", action="store_true", required=False, help="generated tune instanses" + ) + + args = parser.parse_args() + codegen = gemm_a8w8_bpreshuffle_cktile_codegen(args.working_path, args.tune) + + if args.tune: + codegen.gen_instances(kernels_list) + else: + codegen.gen_instances(get_tune_dict(args.tune_file)) diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile.h b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile.h new file mode 100644 index 0000000000..2eb83d065f --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile.h @@ -0,0 +1,20 @@ +#pragma once +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#include + +torch::Tensor gemm_a8w8_bpreshuffle_cktile( + torch::Tensor &XQ, // [M, K] + torch::Tensor &WQ, // [N, K] -> [N/128, K*128] + torch::Tensor &x_scale, // [K/128, M] + torch::Tensor &w_scale, // [K/128, N/128] + torch::Tensor &out // Out:[M, N] fp16 +); +torch::Tensor gemm_a8w8_bpreshuffle_cktile_tune( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &out, + int kernelId, + int splitK); diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh new file mode 100644 index 0000000000..a98336ef31 --- /dev/null +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh @@ -0,0 +1,390 @@ +#pragma once +// SPDX-License-Identifier: MIT +// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifdef USE_ROCM + +#undef __HIP_NO_HALF_OPERATORS__ +#undef __HIP_NO_HALF_CONVERSIONS__ + +#include +#include +#include +#include + +#include +#include +#include + + +#include + +#include "flatmm_basic.hpp" + +using F16 = ck_tile::half_t; +using BF16 = ck_tile::bf16_t; +using FP8 = ck_tile::fp8_t; +using F32 = float; +using B16 = ck_tile::bf16_t; +using ADataType = typename GemmBasicTypeConfig::ADataType; +using BDataType = typename GemmBasicTypeConfig::BDataType; +using CDataType = typename GemmBasicTypeConfig::CDataType; +using AccDataType = typename GemmBasicTypeConfig::AccDataType; +using ALayout = ck_tile::tensor_layout::gemm::RowMajor; +using BLayout = ck_tile::tensor_layout::gemm::ColumnMajor; +using CLayout = ck_tile::tensor_layout::gemm::RowMajor; + +template +static constexpr inline auto is_row_major(Layout layout_) +{ + return ck_tile::bool_constant, + ck_tile::tensor_layout::gemm::RowMajor>>{}; +} + +template +float flatmm_calc(const ck_tile::ScaleFlatmmHostArgs& args, + const ck_tile::stream_config& s) +{ + using CodegenFlatmmShape = ck_tile::TileGemmShape< + ck_tile::sequence, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using Traits = ck_tile::TileGemmTraits; + + using CodegenGemmTraits = ck_tile::TileGemmUniversalTraits; + + using GemmPipelineProblem = + ck_tile::GemmPipelineProblem; + + using BaseGemmPipeline = ck_tile::BaseFlatmmPipelineAGmemBGmemCRegV1; + + const ck_tile::index_t k_grain = args.k_batch * FlatmmConfig::K_Tile; + const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * FlatmmConfig::K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + float ave_time{0}; + + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto 
memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = FlatmmConfig::Scheduler; + constexpr auto memory_operation = memory_operation_.value; + + using CodegenPipelineProblem = ck_tile::FlatmmPipelineProblem; + + using CodegenFlatmmPipeline = + ck_tile::FlatmmPipelineAGmemBGmemCRegV1; + + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + + // ToDo: Will add the codegen part to test different pipeline policies in GEMM. + // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. + using Kernel = ck_tile::FlatmmKernel; + + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(kargs); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" << CodegenFlatmmShape::GetName() << "\n" + << "Shape: " << CodegenFlatmmShape::GetName() << "\n" + << "problem: " << CodegenPipelineProblem::GetName() << "\n" + << "pipeline: " << CodegenFlatmmPipeline::GetName() << "\n" + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + if(s.flush_cache_) + { + std::cout << "Flushing cache..." << std::endl; + static constexpr ck_tile::index_t APackedSize = + std::is_same_v ? 2 : 1; + static constexpr ck_tile::index_t BPackedSize = + std::is_same_v ? 
2 : 1; + + ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( + args.M, args.K, args.stride_A, is_row_major(ALayout{}))); + ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( + args.K, args.N, args.stride_B, is_row_major(BLayout{}))); + + auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; + auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; + + ck_tile::RotatingMemWrapper rotating_mem( + kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer); + rotating_mem.Print(); + + auto run_flush_cache = [&]() { + // flush icache + ck_tile::flush_icache(); + // rotating mem + rotating_mem.Next(); + // clear c mem + if(args.k_batch > 1) + hipGetErrorString(hipMemsetAsync( + args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); + }; + ave_time = ck_tile::launch_kernel_time_mask( + s, + run_flush_cache, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + } + else + { + ave_time = ck_tile::launch_kernel( + s, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + } + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + else + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + }; + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + return ave_time; +} +template +struct CreateTileConfig +{ + static constexpr bool TransposeC = sTransposeC; + static constexpr bool UseStructuredSparsity = sUseStructuredSparsity; + static constexpr int TileParitionerGroupNum = sTileParitionerGroupNum; + static constexpr int TileParitionerM01 = sTileParitionerM01; + static constexpr ck_tile::index_t NumWaveGroups = sNumWaveGroups; + static constexpr bool DoubleSmemBuffer = sDoubleSmemBuffer; + static constexpr bool kPadM = PadM; + static constexpr bool kPadN = PadN; + static constexpr bool kPadK = 
PadK; + static constexpr int kBlockPerCu = BlockPerCu; + static constexpr int M_Tile = MTile; + static constexpr int N_Tile = NTile; + static constexpr int K_Tile = KTile; + static constexpr int M_Warp = MWarp; + static constexpr int N_Warp = NWarp; + static constexpr int K_Warp = KWarp; + static constexpr int M_Warp_Tile = MWTile; + static constexpr int N_Warp_Tile = NWTile; + static constexpr int K_Warp_Tile = KWTile; + static constexpr auto Scheduler = sScheduler; +}; + +template +using CustomConfig = CreateTileConfig; + +template +__forceinline__ torch::Tensor +gemm_a8w8_bpreshuffle_cktile_impl(torch::Tensor& XQ, + torch::Tensor& WQ, + torch::Tensor& x_scale, + torch::Tensor& w_scale, + torch::Tensor& out // Out:[M, N] fp16 +) +{ + TORCH_CHECK(XQ.dtype() == WQ.dtype(), "Weights and activations should have the same dtype!"); + TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), "Scales should have the same dtype!"); + using ADataType = typename GemmBasicTypeConfig::ADataType; + using BDataType = typename GemmBasicTypeConfig::BDataType; + using CDataType = typename GemmBasicTypeConfig::CDataType; + using AccDataType = typename GemmBasicTypeConfig::AccDataType; + using DsDataType = ck_tile::tuple<>; + using ALayout = ck_tile::tensor_layout::gemm::RowMajor; + using BLayout = ck_tile::tensor_layout::gemm::ColumnMajor; + using CLayout = ck_tile::tensor_layout::gemm::RowMajor; + using DsLayout = ck_tile::tuple<>; + using CDEElementWise = ck_tile::element_wise::PassThrough; + int m = XQ.size(0); + int n = out.size(1); + int k = XQ.size(1); + + using ScaleM = typename ck_tile::FlatmmScalePointer<1>; + using ScaleN = typename ck_tile::FlatmmScalePointer<1>; + + + ck_tile::ScaleFlatmmHostArgs args; + args.a_ptr = (void*)XQ.data_ptr(); + args.b_ptr = (void*)WQ.data_ptr(); + args.scale_m = ck_tile::FlatmmScalePointer<1>{reinterpret_cast(x_scale.data_ptr()),m}; + args.scale_n = ck_tile::FlatmmScalePointer<1>{reinterpret_cast(w_scale.data_ptr()),n}; + args.e_ptr = 
(void*)out.data_ptr(); + + args.k_batch = 1; + args.M = m; + args.N = n; + args.K = k; + args.stride_A = k; + args.stride_B = k; + args.stride_C = n; + args.stride_E = n; + + const c10::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(XQ)); + const hipStream_t stream = at::hip::getCurrentHIPStream(); + ck_tile::stream_config naive_config{stream}; + flatmm_calc(args, naive_config); + + return out; +} + +#endif // USE_ROCM diff --git a/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu new file mode 100644 index 0000000000..8ac2135235 --- /dev/null +++ b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#include "gemm_a8w8_bpreshuffle_cktile.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("gemm_a8w8_bpreshuffle_cktile", + &gemm_a8w8_bpreshuffle_cktile, + "gemm_a8w8_bpreshuffle_cktile", + py::arg("XQ"), + py::arg("WQ"), + py::arg("x_scale"), + py::arg("w_scale"), + py::arg("Out")); +} diff --git a/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu new file mode 100644 index 0000000000..a487d1f905 --- /dev/null +++ b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+#include "gemm_a8w8_bpreshuffle_cktile.h" +// #include "rocm_ops.hpp" +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("gemm_a8w8_bpreshuffle_cktile_tune", + &gemm_a8w8_bpreshuffle_cktile_tune, + "gemm_a8w8_bpreshuffle_cktile_tune", + py::arg("XQ"), + py::arg("WQ"), + py::arg("x_scale"), + py::arg("w_scale"), + py::arg("Out"), + py::arg("kernelId") = 0, + py::arg("splitK") = 0); +} diff --git a/csrc/rocm_ops.cpp b/csrc/rocm_ops.cpp index 25f4f64631..bec40f93a6 100644 --- a/csrc/rocm_ops.cpp +++ b/csrc/rocm_ops.cpp @@ -22,6 +22,7 @@ #include "gemm_a8w8.h" #include "gemm_a8w8_blockscale.h" #include "gemm_a8w8_bpreshuffle.h" +#include "gemm_a8w8_bpreshuffle_cktile.h" #include "gemm_common.h" #include "hipbsolgemm.cuh" #include "moe_ck.h" @@ -34,8 +35,8 @@ #include "rmsnorm.h" #include "rocsolgemm.cuh" #include "rope.h" -#include "smoothquant.h" #include "sample.h" +#include "smoothquant.h" #include // #include "torch/mha_batch_prefill.h" diff --git a/op_tests/testflatmm.py b/op_tests/testflatmm.py new file mode 100755 index 0000000000..bc28b7e411 --- /dev/null +++ b/op_tests/testflatmm.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +import torch +import torch.nn.functional as F +import random +import aiter +from aiter import dtypes +from aiter.ops.shuffle import shuffle_weight +from aiter.test_common import checkAllclose, perftest, benchmark +import pandas as pd +import argparse + +TEST_NUM_ITERS = 100 + + +@perftest(num_iters=TEST_NUM_ITERS) +def run_torch(x, weight, x_scale, w_scale, bias=None, dtype=dtypes.bf16): + x = x.to(dtypes.fp32) * x_scale + weight = weight.to(dtypes.fp32) * w_scale + out = F.linear(x, weight) + if bias is not None: + out = out.to(bias) + bias + return out.to(dtype) + + +@perftest(num_iters=TEST_NUM_ITERS) +def run_gemm_ck(x, weight, x_scale, w_scale, bias=None, dtype=dtypes.bf16): + return aiter.gemm_a8w8_CK(x, weight, x_scale, w_scale, bias, dtype) + + +@perftest() +def run_gemm_ck_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): + return aiter.gemm_a8w8_bpreshuffle(x, weight, x_scale, w_scale, None, dtype) + +@perftest() +def run_gemm_cktile_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): + return aiter.gemm_a8w8_bpreshuffle_CKTILE(x, weight, x_scale, w_scale, None, dtype) + +@perftest() +def run_gemm_asm(x, weightshuffle, x_scale, w_scale, bias=None, dtype=dtypes.bf16): + return aiter.gemm_a8w8_ASM(x, weightshuffle, x_scale, w_scale, bias) + +@benchmark() +def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8): + dim = (m, n, k) + x = torch.randn((m, k), dtype=dtype, device="cuda") + weight = torch.randn((n, k), dtype=dtype, device="cuda") + x, x_scale = aiter.pertoken_quant(x, quant_dtype=quantDtype) + weight, w_scale = aiter.pertoken_quant(weight, quant_dtype=quantDtype) + weightshuffle = shuffle_weight(weight, layout=(16, 16)) + bias = torch.rand([1, n], dtype=dtype, device="cuda") * 10 + + # x_pad, _ = F.pad(x,(0,128), "constant", 0).split([x.shape[1], 128],dim=1) + # print(f"{x_pad.shape=}{x_pad.stride()}") + + a, avg_a = run_torch(x, weight, x_scale, w_scale, bias, dtype) + # b, avg_b = run_gemm_ck(x, weight, x_scale, w_scale, bias, 
dtype) + # err_b = checkAllclose(a, b, msg="ck: ", rtol=1e-2, atol=1e-2) + if quantDtype != dtypes.i8: + c, avg_c = run_gemm_ck_bpreshuffle(x, weightshuffle, x_scale, w_scale, dtype) + c = c + bias + err_c = checkAllclose(a, c, msg="ck bpreshuffle: ", rtol=1e-2, atol=1e-2) + f, avg_f = run_gemm_cktile_bpreshuffle(x, weightshuffle, x_scale, w_scale, dtype) + f = f + bias + err_f = checkAllclose(a, f, msg="cktile bpreshuffle: ", rtol=1e-2, atol=1e-2) + else: + avg_c = None + err_c = None + avg_f = None + err_f = None + gpu = torch.cuda.current_device() + device_properties = torch.cuda.get_device_properties(gpu) + cu_num = device_properties.multi_processor_count + cu_num = 80 + + return { + # "ck us": avg_b, + # "ck err": err_b, + "ck bpreshuffle us": avg_c, + "ck bpreshuffle err": err_c, + # "asm us": avg_d, + # "asm err": err_d, + "cktile bpreshuffle us": avg_f, + "cktile bpreshuffle err": err_f, + } + + +def test_normal_gemm_a8w8_pertoken_quant(l_dtype, l_quantDtype, l_mnk): + df = [] + for dtype in l_dtype: + for quantDtype in l_quantDtype: + for m, n, k in l_mnk: + ret = test_gemm(dtype, m, n, k, quantDtype) + df.append(ret) + df = pd.DataFrame(df) + aiter.logger.info(f"summary:\n{df}") + + +l_dtype = ["fp16"] +l_quantDtype = ["fp8"] +# l_mnk_nm = [ +# # (2048, 4096, 5120), +# (1024, 1024, 1024), +# ] +l_m = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] +l_nk = [ + (3072, 1536), + (7168, 256), + (7168, 2048), + (4608, 7168), + (7168, 2304), + (512, 7168), +] +l_mnk_nm = [(m, n, k) for m in l_m for n, k in l_nk] + +parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter, + description="config input of test", +) +parser.add_argument( + "-d", + "--dtype", + type=str, + choices=l_dtype, + nargs="?", + const=None, + default=None, + help="""Data type. 
+ e.g.: -d bf16""", +) +parser.add_argument( + "-q", + "--quantDtype", + type=str, + choices=l_quantDtype, + nargs="?", + const=None, + default=None, + help="""Date type of quantization. + e.g.: -q fp8""", +) +parser.add_argument( + "-mnk", + type=dtypes.str2tuple, + nargs="?", + const=None, + default=None, + help="""shape of mnk. + e.g. -mnk 1280,8192,1024""", +) + +args = parser.parse_args() +if args.dtype is None: + l_dtype = [dtypes.d_dtypes[key] for key in l_dtype] +else: + l_dtype = [dtypes.d_dtypes[args.dtype]] +if args.quantDtype is None: + l_quantDtype = [dtypes.d_dtypes[key] for key in l_quantDtype] +else: + l_quantDtype = [dtypes.d_dtypes[args.quantDtype]] +if args.mnk is not None: + l_mnk_nm = [args.mnk] + +test_normal_gemm_a8w8_pertoken_quant(l_dtype, l_quantDtype, l_mnk_nm) From 78ddd595c28104bb690ad247548743a99cfda76e Mon Sep 17 00:00:00 2001 From: solin Date: Mon, 10 Nov 2025 02:48:59 +0000 Subject: [PATCH 02/13] refine code --- csrc/include/rocm_ops.hpp | 33 ++++++++++++++++--- .../gemm_a8w8_bpreshuffle_cktile_pybind.cu | 14 ++------ ...emm_a8w8_bpreshuffle_cktile_tune_pybind.cu | 17 ++-------- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index d9cf04dee1..2e91cd330b 100755 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -474,6 +474,29 @@ py::arg("Out"), \ py::arg("kernelId") = 0, \ py::arg("splitK") = 0); + +#define GEMM_A8W8_BPRESHUFFLE_CKTILE_PYBIND \ + m.def("gemm_a8w8_bpreshuffle_cktile", \ + &gemm_a8w8_bpreshuffle_cktile, \ + "gemm_a8w8_bpreshuffle_cktile", \ + py::arg("XQ"), \ + py::arg("WQ"), \ + py::arg("x_scale"), \ + py::arg("w_scale"), \ + py::arg("Out")); + +#define GEMM_A8W8_BPRESHUFFLE_CKTILE_TUNE_PYBIND \ + m.def("gemm_a8w8_bpreshuffle_cktile_tune", \ + &gemm_a8w8_bpreshuffle_cktile_tune, \ + "gemm_a8w8_bpreshuffle_cktile_tune", \ + py::arg("XQ"), \ + py::arg("WQ"), \ + py::arg("x_scale"), \ + py::arg("w_scale"), \ + py::arg("Out"), \ + 
py::arg("kernelId") = 0, \ + py::arg("splitK") = 0); + #define MHA_BWD_ASM_PYBIND \ m.def("fmha_v3_bwd", \ &aiter::torch_itfs::fmha_v3_bwd, \ @@ -1196,11 +1219,11 @@ "hipb_findallsols", \ py::arg("mat1"), \ py::arg("mat2"), \ - py::arg("bias") = std::nullopt, \ - py::arg("out_dtype") = std::nullopt, \ - py::arg("scaleA") = std::nullopt, \ - py::arg("scaleB") = std::nullopt, \ - py::arg("scaleC") = std::nullopt, \ + py::arg("bias") = std::nullopt, \ + py::arg("out_dtype") = std::nullopt, \ + py::arg("scaleA") = std::nullopt, \ + py::arg("scaleB") = std::nullopt, \ + py::arg("scaleC") = std::nullopt, \ py::arg("bpreshuffle") = false); \ m.def("getHipblasltKernelName", &getHipblasltKernelName); diff --git a/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu index 8ac2135235..b453764779 100644 --- a/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu +++ b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu @@ -1,15 +1,5 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include "gemm_a8w8_bpreshuffle_cktile.h" - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("gemm_a8w8_bpreshuffle_cktile", - &gemm_a8w8_bpreshuffle_cktile, - "gemm_a8w8_bpreshuffle_cktile", - py::arg("XQ"), - py::arg("WQ"), - py::arg("x_scale"), - py::arg("w_scale"), - py::arg("Out")); -} +#include "rocm_ops.hpp" +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { GEMM_A8W8_BPRESHUFFLE_CKTILE_PYBIND; } diff --git a/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu index a487d1f905..aaa0ba69f7 100644 --- a/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu +++ b/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu @@ -1,17 +1,6 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "gemm_a8w8_bpreshuffle_cktile.h" -// #include "rocm_ops.hpp" -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("gemm_a8w8_bpreshuffle_cktile_tune", - &gemm_a8w8_bpreshuffle_cktile_tune, - "gemm_a8w8_bpreshuffle_cktile_tune", - py::arg("XQ"), - py::arg("WQ"), - py::arg("x_scale"), - py::arg("w_scale"), - py::arg("Out"), - py::arg("kernelId") = 0, - py::arg("splitK") = 0); -} +#include "rocm_ops.hpp" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { GEMM_A8W8_BPRESHUFFLE_CKTILE_TUNE_PYBIND; } From 9915dc77fff49a5560e74b384fccf5266ce72e10 Mon Sep 17 00:00:00 2001 From: solin Date: Thu, 13 Nov 2025 06:27:22 +0000 Subject: [PATCH 03/13] refine code --- .../gemm_a8w8_bpreshuffle_cktile.cu | 72 +------- .../gemm_a8w8_bpreshuffle_cktile_common.py | 47 ++--- .../gemm_a8w8_bpreshuffle_cktile_tune.py | 2 +- .../gemm_a8w8_bpreshuffle_cktile_common.cuh | 3 - op_tests/test_gemm_a8w8.py | 74 +++++--- op_tests/testflatmm.py | 165 ------------------ 6 files changed, 72 insertions(+), 291 deletions(-) delete mode 100755 op_tests/testflatmm.py diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu index 2836921bf9..54727af540 100755 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu @@ -27,77 +27,9 @@ using RowwiseKernelMap = std::unordered_map, RowwiseKe template RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) { - + // Use default kernel for all architectures return a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default< - DDataType, - EDataType>; - // if(K >= 1536) - // { - // if(M < 256 && K % 512 == 0) - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default; - // } - // else - // { - // if(N < 1536) - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x64x128_1x4x1_16x16x64_default; - // } - 
// else - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default; - // } - // } - // } - // else if(K >= 512) - // { - // if(M < 64) - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default; - // } - // else if(M <= 256) - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x64x128_1x4x1_16x16x64_default; - // } - // else - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x64_1x4x1_16x16x64_default; - // } - // } - // else if(K >= 192 && K % 64 == 0) - // { - // if(M <= 256) - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x256x64_1x4x1_16x16x64_default; - // } - // else - // { - // return - // a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x64_1x4x1_16x16x64_default; - // } - // } - // else - { - TORCH_CHECK(false, - "Unsupported K for heuristic dispatch: ", - K, - ". Supported K greater than 192 and K % 64 == 0."); - } + DDataType, EDataType>; } // Helper function to return the next largest power of 2 diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py index 5ce04751f8..f20c110652 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py @@ -191,16 +191,16 @@ def name(self) -> str: 126: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), 127: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), 128: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), - 129: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), - 130: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), - 131: kernelInstance( 0, 0, 
8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), - 132: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), - 133: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), - 134: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), - 135: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), - 136: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), - 137: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), - 138: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 129: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 130: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 131: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 132: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 256, 1, 4, 1, 16, 16, 64, "Default"), + 133: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + 134: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), + 135: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 136: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 192, 256, 1, 4, 1, 16, 16, 64, "Default"), + 137: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 64, 128, 1, 4, 1, 16, 16, 64, "Default"), + 138: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 256, 256, 1, 4, 1, 16, 16, 64, "Default"), } # ''' @@ -226,10 +226,10 @@ def name(self) -> str: 7: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), 8: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 512, 256, 1, 4, 1, 
16, 16, 128, "Default"), 9: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), - 10: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), + 10: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 256, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), 11: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), 12: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 64, 256, 1, 4, 1, 16, 16, 128, "Default"), - 13: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 13: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), 14: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 128, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), 15: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), 16: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), @@ -278,7 +278,7 @@ def name(self) -> str: 59: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 64, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), 60: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 32, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), 61: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 512, 256, 1, 4, 1, 16, 16, 128, "Default"), - 62: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 62: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 80, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), 63: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 512, 1, 4, 1, 16, 16, 128, "Default"), 64: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 16, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), 65: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), @@ -287,7 +287,7 @@ def name(self) -> str: 68: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 64, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), 69: 
kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 32, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), 70: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 512, 256, 1, 4, 1, 16, 16, 128, "Default"), - 71: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), + 71: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 256, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), 72: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 512, 1, 4, 1, 16, 16, 128, "Default"), 73: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 16, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), 74: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 1, 160, 192, 128, 1, 4, 1, 16, 16, 128, "Default"), @@ -345,20 +345,21 @@ def name(self) -> str: 126: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 64, 128, 1, 4, 1, 16, 16, 128, "Default"), 127: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 112, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), 128: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 128, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), - 129: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), - 130: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), - 131: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), - 132: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), + 129: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 192, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 130: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 224, 128, 128, 1, 4, 1, 16, 16, 128, "Default"), + 131: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 192, 256, 1, 4, 1, 16, 16, 128, "Default"), + 132: kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0, 2, 96, 128, 256, 1, 4, 1, 16, 16, 128, "Default"), } default_kernels_dict_950 = { - (-1): kernelInstance(0, 0, 8, 4, 1, 0, 0, 0, 0,1, 256, 256, 128, 1, 4, 1, 16, 16, 128, "Default"), - # (-2):kernelInstance( 
0, 0, 8, 4, 1, 0, 0, 0, 0,1, 16, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), - # (-3):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 32, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), - # (-4):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 64, 256, 64, 1, 4, 1, 16, 16, 128, "Default"), - # (-5):kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 128, 64, 1, 4, 1, 16, 16, 128, "Default"), + (-1): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), + (-2): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 16, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + (-3): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 32, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), + (-4): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 64, 256, 64, 1, 4, 1, 16, 16, 128, "Default"), + (-5): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 128, 64, 1, 4, 1, 16, 16, 128, "Default"), + (-6): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), } # fmt: on diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py index 2c0781dbcc..5069a7c7bf 100755 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py @@ -162,7 +162,7 @@ def tune( tuner = GemmA8W8BpreShuffleCktileTuner( "GemmA8W8BpreShuffleCktileTuner", key=key, - description="gen API for gemm a8w8 bpreshuffle kernel", + description="gen API for gemm a8w8 bpreshuffle cktile kernel", ) args = tuner.parse_args() diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh index a98336ef31..5b5503fee8 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh 
@@ -15,10 +15,7 @@ #include #include #include - - #include - #include "flatmm_basic.hpp" using F16 = ck_tile::half_t; diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py index b7ee6c208a..05587aba57 100755 --- a/op_tests/test_gemm_a8w8.py +++ b/op_tests/test_gemm_a8w8.py @@ -57,6 +57,11 @@ def run_gemm_ck_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): return aiter.gemm_a8w8_bpreshuffle(x, weight, x_scale, w_scale, None, dtype) +@perftest() +def run_gemm_cktile_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): + return aiter.gemm_a8w8_bpreshuffle_CKTILE(x, weight, x_scale, w_scale, None, dtype) + + @perftest() def run_gemm_asm(x, weightshuffle, x_scale, w_scale, bias=None, dtype=dtypes.bf16): return aiter.gemm_a8w8_ASM(x, weightshuffle, x_scale, w_scale, bias) @@ -130,6 +135,15 @@ def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8): else: avg_e = None err_e = None + if quantDtype == dtypes.fp8 and dtype == dtypes.fp16: + f, avg_f = run_gemm_cktile_bpreshuffle( + x, weightshuffle, x_scale, w_scale, dtype + ) + f = f + bias + err_f = checkAllclose(a, f, msg="cktile bpreshuffle: ", rtol=1e-2, atol=1e-2) + else: + avg_f = None + err_f = None return { "ck us": avg_b, "ck err": err_b, @@ -139,6 +153,8 @@ def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8): "asm err": err_d, "hipmm bpreshuffle us": avg_e, "hipmm bpreshuffle err": err_e, + "cktile bpreshuffle us": avg_f, + "cktile bpreshuffle err": err_f, } @@ -346,35 +362,35 @@ def test_skinny_gemm_a8w8_pertoken_quant(): (128, 1280, 8192), (192, 1280, 8192), (256, 1280, 8192), - (320, 1280, 8192), - (512, 1280, 8192), - (1024, 1280, 8192), - (2048, 1280, 8192), - (4096, 1280, 8192), - (8192, 1280, 8192), - (16384, 1280, 8192), - # attn_out - (1, 8192, 1024), - (32, 8192, 1024), - (64, 8192, 1024), - (128, 8192, 1024), - (192, 8192, 1024), - (256, 8192, 1024), - (320, 8192, 1024), - (512, 8192, 1024), - (1024, 8192, 1024), - (2048, 8192, 1024), - (4096, 8192, 1024), - (8192, 8192, 
1024), - (16384, 8192, 1024), - # hipmm preshuffle - (16, 7424, 8192), - (32, 7424, 8192), - (48, 7424, 8192), - (64, 7424, 8192), - (4096, 7424, 8192), - (5120, 7424, 8192), - (8192, 7424, 8192), + # (320, 1280, 8192), + # (512, 1280, 8192), + # (1024, 1280, 8192), + # (2048, 1280, 8192), + # (4096, 1280, 8192), + # (8192, 1280, 8192), + # (16384, 1280, 8192), + # # attn_out + # (1, 8192, 1024), + # (32, 8192, 1024), + # (64, 8192, 1024), + # (128, 8192, 1024), + # (192, 8192, 1024), + # (256, 8192, 1024), + # (320, 8192, 1024), + # (512, 8192, 1024), + # (1024, 8192, 1024), + # (2048, 8192, 1024), + # (4096, 8192, 1024), + # (8192, 8192, 1024), + # (16384, 8192, 1024), + # # hipmm preshuffle + # (16, 7424, 8192), + # (32, 7424, 8192), + # (48, 7424, 8192), + # (64, 7424, 8192), + # (4096, 7424, 8192), + # (5120, 7424, 8192), + # (8192, 7424, 8192), ] parser = argparse.ArgumentParser( diff --git a/op_tests/testflatmm.py b/op_tests/testflatmm.py deleted file mode 100755 index bc28b7e411..0000000000 --- a/op_tests/testflatmm.py +++ /dev/null @@ -1,165 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
- -import torch -import torch.nn.functional as F -import random -import aiter -from aiter import dtypes -from aiter.ops.shuffle import shuffle_weight -from aiter.test_common import checkAllclose, perftest, benchmark -import pandas as pd -import argparse - -TEST_NUM_ITERS = 100 - - -@perftest(num_iters=TEST_NUM_ITERS) -def run_torch(x, weight, x_scale, w_scale, bias=None, dtype=dtypes.bf16): - x = x.to(dtypes.fp32) * x_scale - weight = weight.to(dtypes.fp32) * w_scale - out = F.linear(x, weight) - if bias is not None: - out = out.to(bias) + bias - return out.to(dtype) - - -@perftest(num_iters=TEST_NUM_ITERS) -def run_gemm_ck(x, weight, x_scale, w_scale, bias=None, dtype=dtypes.bf16): - return aiter.gemm_a8w8_CK(x, weight, x_scale, w_scale, bias, dtype) - - -@perftest() -def run_gemm_ck_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): - return aiter.gemm_a8w8_bpreshuffle(x, weight, x_scale, w_scale, None, dtype) - -@perftest() -def run_gemm_cktile_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): - return aiter.gemm_a8w8_bpreshuffle_CKTILE(x, weight, x_scale, w_scale, None, dtype) - -@perftest() -def run_gemm_asm(x, weightshuffle, x_scale, w_scale, bias=None, dtype=dtypes.bf16): - return aiter.gemm_a8w8_ASM(x, weightshuffle, x_scale, w_scale, bias) - -@benchmark() -def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8): - dim = (m, n, k) - x = torch.randn((m, k), dtype=dtype, device="cuda") - weight = torch.randn((n, k), dtype=dtype, device="cuda") - x, x_scale = aiter.pertoken_quant(x, quant_dtype=quantDtype) - weight, w_scale = aiter.pertoken_quant(weight, quant_dtype=quantDtype) - weightshuffle = shuffle_weight(weight, layout=(16, 16)) - bias = torch.rand([1, n], dtype=dtype, device="cuda") * 10 - - # x_pad, _ = F.pad(x,(0,128), "constant", 0).split([x.shape[1], 128],dim=1) - # print(f"{x_pad.shape=}{x_pad.stride()}") - - a, avg_a = run_torch(x, weight, x_scale, w_scale, bias, dtype) - # b, avg_b = run_gemm_ck(x, weight, x_scale, w_scale, bias, 
dtype) - # err_b = checkAllclose(a, b, msg="ck: ", rtol=1e-2, atol=1e-2) - if quantDtype != dtypes.i8: - c, avg_c = run_gemm_ck_bpreshuffle(x, weightshuffle, x_scale, w_scale, dtype) - c = c + bias - err_c = checkAllclose(a, c, msg="ck bpreshuffle: ", rtol=1e-2, atol=1e-2) - f, avg_f = run_gemm_cktile_bpreshuffle(x, weightshuffle, x_scale, w_scale, dtype) - f = f + bias - err_f = checkAllclose(a, f, msg="cktile bpreshuffle: ", rtol=1e-2, atol=1e-2) - else: - avg_c = None - err_c = None - avg_f = None - err_f = None - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - cu_num = 80 - - return { - # "ck us": avg_b, - # "ck err": err_b, - "ck bpreshuffle us": avg_c, - "ck bpreshuffle err": err_c, - # "asm us": avg_d, - # "asm err": err_d, - "cktile bpreshuffle us": avg_f, - "cktile bpreshuffle err": err_f, - } - - -def test_normal_gemm_a8w8_pertoken_quant(l_dtype, l_quantDtype, l_mnk): - df = [] - for dtype in l_dtype: - for quantDtype in l_quantDtype: - for m, n, k in l_mnk: - ret = test_gemm(dtype, m, n, k, quantDtype) - df.append(ret) - df = pd.DataFrame(df) - aiter.logger.info(f"summary:\n{df}") - - -l_dtype = ["fp16"] -l_quantDtype = ["fp8"] -# l_mnk_nm = [ -# # (2048, 4096, 5120), -# (1024, 1024, 1024), -# ] -l_m = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] -l_nk = [ - (3072, 1536), - (7168, 256), - (7168, 2048), - (4608, 7168), - (7168, 2304), - (512, 7168), -] -l_mnk_nm = [(m, n, k) for m in l_m for n, k in l_nk] - -parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter, - description="config input of test", -) -parser.add_argument( - "-d", - "--dtype", - type=str, - choices=l_dtype, - nargs="?", - const=None, - default=None, - help="""Data type. 
- e.g.: -d bf16""", -) -parser.add_argument( - "-q", - "--quantDtype", - type=str, - choices=l_quantDtype, - nargs="?", - const=None, - default=None, - help="""Date type of quantization. - e.g.: -q fp8""", -) -parser.add_argument( - "-mnk", - type=dtypes.str2tuple, - nargs="?", - const=None, - default=None, - help="""shape of mnk. - e.g. -mnk 1280,8192,1024""", -) - -args = parser.parse_args() -if args.dtype is None: - l_dtype = [dtypes.d_dtypes[key] for key in l_dtype] -else: - l_dtype = [dtypes.d_dtypes[args.dtype]] -if args.quantDtype is None: - l_quantDtype = [dtypes.d_dtypes[key] for key in l_quantDtype] -else: - l_quantDtype = [dtypes.d_dtypes[args.quantDtype]] -if args.mnk is not None: - l_mnk_nm = [args.mnk] - -test_normal_gemm_a8w8_pertoken_quant(l_dtype, l_quantDtype, l_mnk_nm) From 5c6e7db1bcc2a115be8232e8048ac6d3f9f26b76 Mon Sep 17 00:00:00 2001 From: solin Date: Thu, 13 Nov 2025 06:31:38 +0000 Subject: [PATCH 04/13] refine code --- op_tests/test_gemm_a8w8.py | 58 +++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py index 05587aba57..e5e695e004 100755 --- a/op_tests/test_gemm_a8w8.py +++ b/op_tests/test_gemm_a8w8.py @@ -362,35 +362,35 @@ def test_skinny_gemm_a8w8_pertoken_quant(): (128, 1280, 8192), (192, 1280, 8192), (256, 1280, 8192), - # (320, 1280, 8192), - # (512, 1280, 8192), - # (1024, 1280, 8192), - # (2048, 1280, 8192), - # (4096, 1280, 8192), - # (8192, 1280, 8192), - # (16384, 1280, 8192), - # # attn_out - # (1, 8192, 1024), - # (32, 8192, 1024), - # (64, 8192, 1024), - # (128, 8192, 1024), - # (192, 8192, 1024), - # (256, 8192, 1024), - # (320, 8192, 1024), - # (512, 8192, 1024), - # (1024, 8192, 1024), - # (2048, 8192, 1024), - # (4096, 8192, 1024), - # (8192, 8192, 1024), - # (16384, 8192, 1024), - # # hipmm preshuffle - # (16, 7424, 8192), - # (32, 7424, 8192), - # (48, 7424, 8192), - # (64, 7424, 8192), - # (4096, 7424, 8192), - # 
(5120, 7424, 8192), - # (8192, 7424, 8192), + (320, 1280, 8192), + (512, 1280, 8192), + (1024, 1280, 8192), + (2048, 1280, 8192), + (4096, 1280, 8192), + (8192, 1280, 8192), + (16384, 1280, 8192), + # attn_out + (1, 8192, 1024), + (32, 8192, 1024), + (64, 8192, 1024), + (128, 8192, 1024), + (192, 8192, 1024), + (256, 8192, 1024), + (320, 8192, 1024), + (512, 8192, 1024), + (1024, 8192, 1024), + (2048, 8192, 1024), + (4096, 8192, 1024), + (8192, 8192, 1024), + (16384, 8192, 1024), + # hipmm preshuffle + (16, 7424, 8192), + (32, 7424, 8192), + (48, 7424, 8192), + (64, 7424, 8192), + (4096, 7424, 8192), + (5120, 7424, 8192), + (8192, 7424, 8192), ] parser = argparse.ArgumentParser( From 7d4205977ab6230e075c4e4680a53b77fe4d446b Mon Sep 17 00:00:00 2001 From: solin Date: Thu, 13 Nov 2025 08:26:19 +0000 Subject: [PATCH 05/13] refine --- .../gemm_a8w8_bpreshuffle_cktile_common.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py index f20c110652..3795682f94 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py @@ -357,9 +357,7 @@ def name(self) -> str: (-1): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 256, 256, 1, 4, 1, 16, 16, 128, "Default"), (-2): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 16, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), (-3): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 32, 64, 512, 1, 4, 1, 16, 16, 128, "Default"), - (-4): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 64, 256, 64, 1, 4, 1, 16, 16, 128, "Default"), - (-5): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 128, 64, 1, 4, 1, 16, 16, 128, "Default"), - (-6): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 128, 128, 1, 4, 1, 16, 16, 64, "Default"), + (-4): kernelInstance( 0, 0, 8, 4, 1, 0, 0, 0, 0,1, 128, 
128, 128, 1, 4, 1, 16, 16, 64, "Default"), } # fmt: on From 848a15442b5e1fdbbc46fee41adb7dffa1f0b7a1 Mon Sep 17 00:00:00 2001 From: solin Date: Thu, 13 Nov 2025 08:45:16 +0000 Subject: [PATCH 06/13] refine --- .../gemm_a8w8_bpreshuffle_cktile_tune.cu | 16 +--------------- .../gemm_a8w8_bpreshuffle_cktile_common.cuh | 1 - 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu index cd52d0df56..ac6291d173 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu @@ -44,21 +44,7 @@ RowwiseKernel rowwise_dispatch(int id) static_assert(false, "rowwise_dispatch used with unsupported dtype!"); } }(); - - // DEBUG: Print lookup table size - static bool debug_printed = false; - if(!debug_printed) - { - std::cout << "[solinDEBUG] Lookup table size: " << lookup.size() << std::endl; - std::cout << "[solinDEBUG] Available kernel IDs: "; - for(const auto& kv : lookup) - { - std::cout << kv.first << " "; - } - std::cout << std::endl; - debug_printed = true; - } - + TORCH_CHECK(id < lookup.size(), "Kernel id " + std::to_string(id) + " is out of range! (lookup.size()=" + std::to_string(lookup.size()) + ")"); diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh index 5b5503fee8..c19fb3e15c 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh @@ -143,7 +143,6 @@ float flatmm_calc(const ck_tile::ScaleFlatmmHostArgs& args, 1, false>>; - // ToDo: Will add the codegen part to test different pipeline policies in GEMM. // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. 
using Kernel = ck_tile::FlatmmKernel; From 96819e08fc3f378c70e8177d45077995e3b89517 Mon Sep 17 00:00:00 2001 From: solin Date: Fri, 14 Nov 2025 03:42:23 +0000 Subject: [PATCH 07/13] fix merge conflict --- aiter/jit/core.py | 461 +++++++++---------------------- aiter/jit/optCompilerConfig.json | 93 +++++++ 2 files changed, 218 insertions(+), 336 deletions(-) diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 5f359c62a9..358bb73142 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -14,7 +14,7 @@ import traceback import types import typing -from typing import Any, Callable, List, Optional, Union, get_args, get_origin +from typing import Any, Callable, List, Optional from packaging.version import Version, parse @@ -23,7 +23,7 @@ from chip_info import get_gfx from cpp_extension import _jit_compile, get_hip_version from file_baton import FileBaton -from torch_guard import is_torch_equal_or_newer, torch_compile_guard # noqa: E402 +from torch_guard import torch_compile_guard # noqa: E402 AITER_REBUILD = int(os.environ.get("AITER_REBUILD", "0")) @@ -68,7 +68,81 @@ def mp_lock( AITER_LOG_MORE = int(os.getenv("AITER_LOG_MORE", 0)) AITER_LOG_TUNED_CONFIG = int(os.getenv("AITER_LOG_TUNED_CONFIG", 0)) + # config_env start here +def update_config_files(file_path: str, merge_name: str): + path_list = file_path.split(os.pathsep) if file_path else [] + if len(path_list) <= 1: + return file_path + df_list = [] + ## merge config files + ##example: AITER_CONFIG_GEMM_A4W4="/path1:/path2" + import pandas as pd + + df_list.append(pd.read_csv(path_list[0])) + for i, path in enumerate(path_list[1:]): + if os.path.exists(path): + df = pd.read_csv(path) + ## check columns + assert ( + df.columns.tolist() == df_list[0].columns.tolist() + ), f"Column mismatch between {path_list[0]} and {path}, {df_list[0].columns.tolist()}, {df.columns.tolist()}" + + df_list.append(df) + else: + print(f"path {i+1}: {path} (not exist)") + merge_df = pd.concat(df_list, ignore_index=True) if df_list 
else pd.DataFrame() + ## get keys from untuned file to drop_duplicates + untuned_name = ( + re.sub(r"(?:_)?tuned$", r"\1untuned", merge_name) + if re.search(r"(?:_)?tuned$", merge_name) + else merge_name.replace("tuned", "untuned") + ) + untuned_path = f"{AITER_ROOT_DIR}/aiter/configs/{untuned_name}.csv" + if os.path.exists(untuned_path): + untunedf = pd.read_csv(untuned_path) + keys = untunedf.columns + merge_df = ( + merge_df.sort_values("us") + .drop_duplicates(subset=keys, keep="first") + .reset_index(drop=True) + ) + else: + logger.warning( + f"Untuned config file not found: {untuned_path}. Using all columns for deduplication." + ) + new_file_path = f"/tmp/{merge_name}.csv" + merge_df.to_csv(new_file_path, index=False) + return new_file_path + + +# @functools.lru_cache(maxsize=1) +def get_config_file(env_name, default_file, tuned_file_name): + config_env_file = os.getenv(env_name) + # default_file = f"{AITER_ROOT_DIR}/aiter/configs/{tuned_file_name}.csv" + from pathlib import Path + + if not config_env_file: + model_config_dir = Path(f"{AITER_ROOT_DIR}/aiter/configs/model_configs/") + op_tuned_file_list = [ + p + for p in model_config_dir.glob(f"*{tuned_file_name}*") + if (p.is_file() and "untuned" not in str(p)) + ] + + if not op_tuned_file_list: + config_file = default_file + else: + tuned_files = ":".join(str(p) for p in op_tuned_file_list) + tuned_files = default_file + ":" + tuned_files + print(f"merge tuned file under model_configs/ and configs/ ", tuned_files) + config_file = update_config_files(tuned_files, tuned_file_name) + else: + config_file = update_config_files(config_env_file, tuned_file_name) + # print(f"get config file from environment ", config_file) + return config_file + + AITER_CONFIG_GEMM_A4W4 = os.getenv( "AITER_CONFIG_GEMM_A4W4", f"{AITER_ROOT_DIR}/aiter/configs/a4w4_blockscale_tuned_gemm.csv", @@ -107,7 +181,7 @@ def mp_lock( ) AITER_CONFIG_BF16_BATCHED_GEMM = os.getenv( - "AITER_CONFIG_BATCHED_GEMM_BF16", + 
"AITER_CONFIG_BF16_BATCHED_GEMM", f"{AITER_ROOT_DIR}/aiter/configs/bf16_tuned_batched_gemm.csv", ) @@ -115,65 +189,54 @@ def mp_lock( "AITER_CONFIG_GEMM_BF16", f"{AITER_ROOT_DIR}/aiter/configs/tuned_gemm.csv", ) +AITER_CONFIG_GEMM_A4W4_FILE = get_config_file( + "AITER_CONFIG_GEMM_A4W4", AITER_CONFIG_GEMM_A4W4, "a4w4_blockscale_tuned_gemm" +) - -def update_config_files(file_path: str, merge_name: str): - path_list = file_path.split(os.pathsep) if file_path else [] - if len(path_list) <= 1: - return file_path - df_list = [] - ## merge config files - ##example: AITER_CONFIG_GEMM_A4W4="/path1:/path2" - import pandas as pd - - df_list.append(pd.read_csv(path_list[0])) - for i, path in enumerate(path_list[1:]): - if os.path.exists(path): - df = pd.read_csv(path) - ## check columns - assert ( - df.columns.tolist() == df_list[0].columns.tolist() - ), f"Column mismatch between {path_list[0]} and {path}, {df_list[0].columns.tolist()}, {df.columns.tolist()}" - - df_list.append(df) - else: - print(f"path {i+1}: {path} (not exist)") - merge_df = pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame() - merge_df = merge_df.drop_duplicates(keep="last") - new_file_path = f"/tmp/{merge_name}.csv" - merge_df.to_csv(new_file_path, index=False) - return new_file_path - - -AITER_CONFIG_GEMM_A4W4_FILE = update_config_files( - AITER_CONFIG_GEMM_A4W4, "a4w4_blockscale_tuned_gemm" +AITER_CONFIG_GEMM_A8W8_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8", AITER_CONFIG_GEMM_A8W8, "a8w8_tuned_gemm" ) -AITER_CONFIG_GEMM_A8W8_FILE = update_config_files( - AITER_CONFIG_GEMM_A8W8, "a8w8_tuned_gemm" +AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE", + AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE, + "a8w8_bpreshuffle_tuned_gemm", ) -AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE = update_config_files( - AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE, "a8w8_bpreshuffle_tuned_gemm" +AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE = get_config_file( + 
"AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE", + AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE, + "a8w8_bpreshuffle_cktile_tuned_gemm", ) -AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE = update_config_files( - AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE, "a8w8_bpreshuffle_cktile_tuned_gemm" +AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE", + AITER_CONFIG_GEMM_A8W8_BLOCKSCALE, + "a8w8_blockscale_tuned_gemm", ) -AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE = update_config_files( - AITER_CONFIG_GEMM_A8W8_BLOCKSCALE, "a8w8_blockscale_tuned_gemm" +AITER_CONFIG_FMOE_FILE = get_config_file( + "AITER_CONFIG_FMOE", AITER_CONFIG_FMOE, "tuned_fmoe" ) -AITER_CONFIG_FMOE_FILE = update_config_files(AITER_CONFIG_FMOE, "tuned_fmoe") -AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE = update_config_files( + +AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE", AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE, "a8w8_blockscale_bpreshuffle_tuned_gemm", ) -AITER_CONFIG_A8W8_BATCHED_GEMM_FILE = update_config_files( - AITER_CONFIG_A8W8_BATCHED_GEMM, "a8w8_tuned_batched_gemm" + +AITER_CONFIG_A8W8_BATCHED_GEMM_FILE = get_config_file( + "AITER_CONFIG_A8W8_BATCHED_GEMM", + AITER_CONFIG_A8W8_BATCHED_GEMM, + "a8w8_tuned_batched_gemm", ) -AITER_CONFIG_BF16_BATCHED_GEMM_FILE = update_config_files( - AITER_CONFIG_BF16_BATCHED_GEMM, "bf16_tuned_batched_gemm" + +AITER_CONFIG_BF16_BATCHED_GEMM_FILE = get_config_file( + "AITER_CONFIG_BF16_BATCHED_GEMM", + AITER_CONFIG_BF16_BATCHED_GEMM, + "bf16_tuned_batched_gemm", ) -AITER_CONFIG_GEMM_BF16_FILE = update_config_files( - AITER_CONFIG_GEMM_BF16, "bf16_tuned_gemm" + +AITER_CONFIG_GEMM_BF16_FILE = get_config_file( + "AITER_CONFIG_GEMM_BF16", AITER_CONFIG_GEMM_BF16, "bf16_tuned_gemm" ) + # config_env end here find_aiter = importlib.util.find_spec("aiter") @@ -673,160 +736,6 @@ def convert(d_ops: dict): ) -MANUAL_SCHEMA_OPS = [ - "register_graph_buffers", 
- "module_moe_ck2stages", - "mha_fwd", - "fmha_v3_fwd", - "mha_varlen_fwd", - "mha_bwd", - "fmha_v3_bwd", - "mha_varlen_bwd", - "fmha_v3_varlen_bwd", - "fmha_v3_varlen_fwd", - "mha_batch_prefill", - "hipb_findallsols", - "rocb_findallsols", - "_ActivationType", - "_QuantType", - "init_custom_ar", - "greedy_sample", - "random_sample_outer_exponential", - "random_sample", - "mixed_sample", - "exponential", -] - -NONE_WRAPPED_OP = [ - # "hipb_create_extension", - # "hipb_destroy_extension", - "getHipblasltKernelName", - # "rocb_create_extension", - # "rocb_destroy_extension", - "get_graph_buffer_ipc_meta", - "_ActivationType", - "_QuantType", - # "dispose", - # "meta_size", - # "get_padded_m", - "compile_mha_fwd", - "compile_mha_bwd", - "init_custom_qr", - "qr_max_size", - "qr_destroy", - "qr_open_handles", - "qr_get_handle", -] - -# We default all args are inplace, you can define inplace args for specific op -SPECIAL_OPS_MUTATES_ARGS = {} - - -def generate_schema(func) -> str: - import inspect - - import torch - - sig = inspect.signature(func) - parameters = [] - mutates_args = SPECIAL_OPS_MUTATES_ARGS.get(func.__name__, []) - for idx, (name, param) in enumerate(sig.parameters.items()): - param_type = param.annotation - flag = True - is_mutates = True - if len(mutates_args) > 0 and name not in mutates_args: - is_mutates = False - - if param_type is torch.Tensor: - if is_mutates: - type_str = f"Tensor(a{idx}!)" - else: - type_str = "Tensor" - elif param_type == Optional[torch.Tensor]: - if is_mutates: - type_str = f"Tensor(a{idx}!)?" - else: - type_str = "Tensor?" - elif get_origin(param_type) is Union and torch.Tensor in get_args(param_type): - if is_mutates: - type_str = f"Tensor(a{idx}!)?" - else: - type_str = "Tensor?" - elif param_type in (torch.SymInt, int): - type_str = "SymInt" - elif param_type in (float, bool, str): - type_str = param_type.__name__ - elif param_type == Optional[torch.Generator]: - type_str = "Generator?" 
- elif ( - get_origin(param_type) in (list, List) - and get_args(param_type)[0] is torch.Tensor - ): - if is_mutates: - type_str = f"Tensor(a{idx}!)[]" - else: - type_str = "Tensor[]" - elif get_origin(param_type) in (list, List) and get_args(param_type)[0] is int: - type_str = "int[]" - elif param_type == Optional[torch.dtype]: - type_str = "ScalarType?" - else: - type_str = "*" - flag = False - if flag: - param_str = f"{type_str} {name}" - - if param.default != inspect.Parameter.empty: - if param.default is None: - param_str += "=None" - else: - param_str += f"={param.default}" - else: - param_str = f"{type_str} " - - parameters.append(param_str) - return_annotation = sig.return_annotation - return_type = "" - if return_annotation is type(None) or return_annotation is None: - return_type = "()" - elif return_annotation is torch.Tensor: - return_type = "Tensor" - elif ( - get_origin(return_annotation) is list and get_args(return_annotation)[0] is int - ): - return_type = "int[]" - elif return_annotation is int: - return_type = "int" - elif return_annotation is float: - return_type = "float" - elif return_annotation is bool: - return_type = "bool" - elif ( - get_origin(return_annotation) is list - and get_args(return_annotation)[0] is torch.Tensor - ): - return_type = "Tensor[]" - elif get_origin(return_annotation) is tuple: - args = get_args(return_annotation) - type_strings = [] - for arg in args: - if arg is torch.Tensor: - type_strings.append("Tensor") - elif arg is int: - type_strings.append("int") - elif arg is float: - type_strings.append("float") - elif arg is bool: - type_strings.append("bool") - return_type = f"({', '.join(type_strings)})" - else: - return_type = "Any" - - schema = f"({', '.join(parameters)}) -> {return_type}" - - return schema - - def compile_ops( _md_name: str, fc_name: Optional[str] = None, @@ -924,6 +833,13 @@ def check_args(): doc_str = op.__doc__.split("\n")[0] doc_str = re.sub(r"<(.*?)\:.*?>", r"\g<1>", doc_str) doc_str = 
doc_str.replace("list[", "List[") + doc_str = doc_str.replace("tuple[", "Tuple[") + doc_str = doc_str.replace("collections.abc.Sequence[", "List[") + doc_str = doc_str.replace("typing.SupportsInt", "int") + doc_str = doc_str.replace("typing.SupportsFloat", "float") + # A|None --> Optional[A] + pattern = r"([\w\.]+(?:\[[^\]]+\])?)\s*\|\s*None" + doc_str = re.sub(pattern, r"Optional[\1]", doc_str) for el in enum_types: doc_str = re.sub(f" aiter.*{el} ", f" {el} ", doc_str) namespace = { @@ -932,6 +848,7 @@ def check_args(): "torch": torch, "typing": typing, } + exec( f"from aiter import*\ndef {doc_str}: pass", namespace, @@ -992,138 +909,10 @@ def check_args(): return op(*args, **kwargs) - if func.__name__ in NONE_WRAPPED_OP: - return wrapper - - def wrapper_register(func): - import inspect - - import torch - import torch.library - from torch.library import Library - - global aiter_lib - aiter_lib = Library("aiter", "FRAGMENT") if aiter_lib is None else aiter_lib - schema = "" - if func.__name__ in MANUAL_SCHEMA_OPS: - schema = generate_schema(func) - else: - sig = inspect.signature(func) - mutates_args = SPECIAL_OPS_MUTATES_ARGS.get(func.__name__, "unknown") - if hasattr(torch.library, "infer_schema"): - sig = torch.library.infer_schema(func, mutates_args=mutates_args) - else: - # for pytorch 2.4 - import torch._custom_op.impl - - # torch 2.4 not support mutates "unknown" for inplace all param - if mutates_args == "unknown": - mutates_args = [] - - for param_name, param in sig.parameters.items(): - if param.annotation == torch.Tensor: - mutates_args.append(param_name) - - sig = torch._custom_op.impl.infer_schema(func, mutates_args) - schema = f"{sig}" - return schema - - schema = wrapper_register(func) - - import inspect - - import torch - - sig = inspect.signature(func) - input_is_tensor = False - parameters = list(sig.parameters.values()) - - if parameters: - first_param = parameters[0] - if ( - first_param.annotation is not inspect.Parameter.empty - and 
first_param.annotation is torch.Tensor - ): - input_is_tensor = True - - input_part, output_part = schema.split("->", 1) - if input_is_tensor: - new_input = input_part - else: - if not sig.parameters: - new_input = "(Tensor dummy)" - else: - new_input = "(Tensor dummy, " + input_part[1:] - - return_int = False - return_annotation = sig.return_annotation - if return_annotation is int: - output_part = "(Tensor, " + output_part + ")" - return_int = True - - schema = f"{new_input} -> {output_part}".strip() - - loadName = func.__name__ - - def abstract_impl(*args, custom_build_args={}, **kwargs): - if return_int: - return torch.empty(1, device="cuda"), 1 - if gen_fake is not None: - return gen_fake(*args, **kwargs) - return func(*args, **kwargs) - - def outer_wrapper(*args, **kwargs): - return ( - wrapper(*args, **kwargs) - if not return_int - else (torch.empty(1, device="cuda"), wrapper(*args, **kwargs)) - ) - - def abstract_impl_dummy(dummy, *args, custom_build_args={}, **kwargs): - if return_int: - return torch.empty(1, device="cuda"), 1 - if gen_fake is not None: - return gen_fake(*args, **kwargs) - return func(*args, **kwargs) - - def outer_wrapper_dummy(dummy, *args, **kwargs): - return ( - wrapper(*args, **kwargs) - if not return_int - else (torch.empty(1, device="cuda"), wrapper(*args, **kwargs)) - ) - - custom_func = outer_wrapper - fake_func = abstract_impl - if not input_is_tensor: - custom_func = outer_wrapper_dummy - fake_func = abstract_impl_dummy - - if not hasattr(torch.ops.aiter, f"wrapper_{loadName}"): - if is_torch_equal_or_newer("2.8.0"): - tags = () - else: - tags = (torch.Tag.needs_fixed_stride_order,) - op_schema = f"aiter::wrapper_{loadName}" + schema - aiter_lib.define(op_schema, tags=tags) - aiter_lib.impl( - f"aiter::wrapper_{loadName}", custom_func, dispatch_key="CUDA" - ) - aiter_lib.impl( - f"aiter::wrapper_{loadName}", custom_func, dispatch_key="CPU" - ) - aiter_lib._register_fake(f"wrapper_{loadName}", fake_func) - - def 
wrapper_custom(*args, custom_build_args={}, **kwargs): - result = ( - getattr(torch.ops.aiter, f"wrapper_{loadName}")(*args, **kwargs) - if input_is_tensor - else getattr(torch.ops.aiter, f"wrapper_{loadName}")( - torch.empty(1, device="cuda"), *args, **kwargs - ) - ) - return result[1] if return_int else result + @torch_compile_guard(device="cuda", gen_fake=gen_fake, calling_func_=func) + def custom_wrapper(*args, **kwargs): + return wrapper(*args, **kwargs) - return wrapper_custom + return custom_wrapper return decorator diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json index 1e345e2184..5a55f8b5e8 100644 --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -269,6 +269,25 @@ "is_standalone": "False", "blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_a8w8_bpreshuffle/gen_instances.py --working_path {{}} --tune_file {AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE}'" }, + "module_deepgemm": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/deepgemm_pybind.cu'", + "f'{AITER_CSRC_DIR}/ck_deepgemm/deepgemm.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "md_name": "'module_deepgemm'", + "extra_ldflags": "None", + "extra_include": [ + "f'{CK_DIR}/example/ck_tile/18_flatmm'", + "f'{AITER_CSRC_DIR}/ck_deepgemm/include'" + ], + "verbose": "False", + "is_python_module": "True", + "is_standalone": "False", + "hip_clang_path": "os.environ.get('FLATMM_HIP_CLANG_PATH')", + "blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_deepgemm/gen_instances.py --working_path {{}}'" + }, "module_gemm_a8w8_bpreshuffle_cktile": { "srcs": [ "f'{AITER_CSRC_DIR}/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu'", @@ -394,6 +413,24 @@ "hip_clang_path": "os.environ.get('GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH')", "blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_moe_2stages_codegen/gen_instances.py --working_path {{}}'" }, + "module_moe_cktile2stages": { + "srcs": [ + "f'{AITER_CSRC_DIR}/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu'", + 
"f'{AITER_CSRC_DIR}/pybind/moe_cktile_2stages_pybind.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "md_name": "'module_moe_cktile2stages'", + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/ck_tile_gemm_moe_2stages/include'" + ], + "verbose": "False", + "is_python_module": "True", + "is_standalone": "False", + "hip_clang_path": "os.environ.get('FLATMM_HIP_CLANG_PATH')", + "blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_tile_gemm_moe_2stages/gen_instances.py --working_path {{}}'" + }, "module_moe_sorting": { "srcs": [ "f'{AITER_CSRC_DIR}/py_itfs_ck/moe_sorting_kernels.cu'", @@ -412,6 +449,22 @@ "verbose": "False", "blob_gen_cmd": "''" }, + "module_moe_topk": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/moe_topk_pybind.cu'", + "f'{AITER_CSRC_DIR}/py_itfs_ck/topk_sigmoid_kernels.cu'", + "f'{CK_DIR}/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/include/ck_tile'", + "f'{CK_DIR}/example/ck_tile/09_topk_softmax'" + ], + "verbose": "False", + "blob_gen_cmd": "''" + }, "module_norm": { "srcs": [ "f'{AITER_CSRC_DIR}/py_itfs_ck/norm_kernels.cu'", @@ -938,5 +991,45 @@ ], "verbose": "False", "blob_gen_cmd": "''" + }, + "module_top_k_per_row": { + "srcs": [ + "f'{AITER_CSRC_DIR}/kernels/topk_per_row_kernels.cu'", + "f'{AITER_CSRC_DIR}/pybind/topk_per_row_pybind.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_mla_metadata": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/mla_metadata_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/mla/metadata.cu'", + "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_comm.cuh'", + "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_1_device.cuh'", + "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_1_host.cuh'", + "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_2_device.cuh'" + ], + "flags_extra_cc": [], + 
"flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_mla_reduce": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/mla_reduce_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/mla/reduce.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" } } \ No newline at end of file From 305e2741771464204adf0af22a54c2eb912998b0 Mon Sep 17 00:00:00 2001 From: solin Date: Fri, 14 Nov 2025 04:06:19 +0000 Subject: [PATCH 08/13] fix conflict --- aiter/jit/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 358bb73142..b310f95645 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -90,7 +90,7 @@ def update_config_files(file_path: str, merge_name: str): df_list.append(df) else: - print(f"path {i+1}: {path} (not exist)") + logger.info(f"path {i+1}: {path} (not exist)") merge_df = pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame() ## get keys from untuned file to drop_duplicates untuned_name = ( @@ -135,7 +135,9 @@ def get_config_file(env_name, default_file, tuned_file_name): else: tuned_files = ":".join(str(p) for p in op_tuned_file_list) tuned_files = default_file + ":" + tuned_files - print(f"merge tuned file under model_configs/ and configs/ ", tuned_files) + logger.info( + f"merge tuned file under model_configs/ and configs/ {tuned_files}" + ) config_file = update_config_files(tuned_files, tuned_file_name) else: config_file = update_config_files(config_env_file, tuned_file_name) From 829c6aa65f65ad76cfe3fc78fc5d86526e10cb36 Mon Sep 17 00:00:00 2001 From: solin Date: Fri, 14 Nov 2025 07:13:30 +0000 Subject: [PATCH 09/13] fix CI build fail --- .../gemm_a8w8_bpreshuffle_cktile_common.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git 
a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py index 3795682f94..670e73894a 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_common.py @@ -1,7 +1,20 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. from dataclasses import dataclass -from aiter.jit.utils.chip_info import get_gfx +import os +import sys + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = os.path.abspath(f"{this_dir}/../../../") +if os.path.exists(os.path.join(AITER_CORE_DIR, "aiter_meta")): + AITER_CORE_DIR = os.path.join(AITER_CORE_DIR, "aiter/jit/utils") # pip install mode +else: + AITER_CORE_DIR = os.path.abspath( + f"{this_dir}/../../aiter/jit/utils" + ) # develop mode +sys.path.insert(0, AITER_CORE_DIR) + +from chip_info import get_gfx # noqa: E402 @dataclass From f312c3a654cc206dc5ba51f65a2de17e489d17ed Mon Sep 17 00:00:00 2001 From: solin Date: Mon, 17 Nov 2025 04:36:37 +0000 Subject: [PATCH 10/13] refine code --- aiter/jit/optCompilerConfig.json | 4 ++-- aiter/ops/gemm_op_a8w8.py | 8 ++++---- csrc/cktile_gemm_a8w8_bpreshuffle/README.md | 6 +++--- op_tests/test_gemm_a8w8.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json index 5a55f8b5e8..1d1e2f3e9f 100755 --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -304,7 +304,7 @@ "is_python_module": "True", "is_standalone": "False", "verbose": "False", - "hip_clang_path": "os.environ.get('GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH')", + "hip_clang_path": "os.environ.get('FLATMM_HIP_CLANG_PATH')", "blob_gen_cmd": "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/gen_instances.py --working_path {{}} --tune_file 
{AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE}'" }, "module_gemm_a8w8_asm": { @@ -614,7 +614,7 @@ "f'{CK_DIR}/example/ck_tile/18_flatmm'" ], "verbose": "False", - "hip_clang_path": "os.environ.get('GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH')", + "hip_clang_path": "os.environ.get('FLATMM_HIP_CLANG_PATH')", "is_python_module": "True", "is_standalone": "False", "blob_gen_cmd": "f'{AITER_CSRC_DIR}/cktile_gemm_a8w8_bpreshuffle/gen_instances.py --working_path {{}} --tune'" diff --git a/aiter/ops/gemm_op_a8w8.py b/aiter/ops/gemm_op_a8w8.py index 4bb3b9ea2e..de8ab92369 100644 --- a/aiter/ops/gemm_op_a8w8.py +++ b/aiter/ops/gemm_op_a8w8.py @@ -639,7 +639,7 @@ def gemm_a8w8_blockscale_bpreshuffle_tune( ) -> torch.Tensor: ... -def gemm_a8w8_bpreshuffle_CKTILE( +def gemm_a8w8_cktile_bpreshuffle( XQ: Tensor, WQ: Tensor, x_scale: Tensor, @@ -651,7 +651,7 @@ def gemm_a8w8_bpreshuffle_CKTILE( assert dtype in [ torch.bfloat16, torch.float16, - ], f"Output {dtype=} is currently not supported in gemm_a8w8_bpreshuffle_CKTILE" + ], f"Output {dtype=} is currently not supported in gemm_a8w8_cktile_bpreshuffle" m = XQ.shape[0] n = WQ.shape[0] k = XQ.shape[-1] @@ -660,8 +660,8 @@ def gemm_a8w8_bpreshuffle_CKTILE( # m, n, k, dtypes.fp8, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE # ) get_CKGEMM_config(m, n, k, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE) - assert WQ.dtype == dtypes.fp8, "gemm_a8w8_bpreshuffle_CKTILE only support fp8 now" - assert bias is None, "gemm_a8w8_bpreshuffle_CKTILE does not support bias now" + assert WQ.dtype == dtypes.fp8, "gemm_a8w8_cktile_bpreshuffle only support fp8 now" + assert bias is None, "gemm_a8w8_cktile_bpreshuffle does not support bias now" Y = torch.empty(m, n, dtype=dtype, device=XQ.device) return gemm_a8w8_bpreshuffle_cktile(XQ, WQ, x_scale, w_scale, Y) diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/README.md b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md index 571632fe98..bba94625eb 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/README.md +++ 
b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md @@ -5,12 +5,12 @@ 2. Tune gemm a8w8: First add GEMM shapes in `aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv`, then run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_cktile_tune via jit: -`GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py -i aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv -o aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` +`FLATMM_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py -i aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv -o aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` If you want to use split K kernels, you can add the `-k` parameter at the end, notice that should change `bias` to `bias/(2^k)`. You can find the results of the tuning in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv`. -3. Test the performance, modify the test instance in `op_tests/testflatmm.py` and run it, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_cktile kernels in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` via jit: -`GEMM_CKTILE_BPRESHUFFLE_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 op_tests/testflatmm.py` +3. 
Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_cktile kernels in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` via jit: +`FLATMM_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 op_tests/test_gemm_a8w8.py` ## More diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py index e5e695e004..8f0691e8e0 100755 --- a/op_tests/test_gemm_a8w8.py +++ b/op_tests/test_gemm_a8w8.py @@ -59,7 +59,7 @@ def run_gemm_ck_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): @perftest() def run_gemm_cktile_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): - return aiter.gemm_a8w8_bpreshuffle_CKTILE(x, weight, x_scale, w_scale, None, dtype) + return aiter.gemm_a8w8_cktile_bpreshuffle(x, weight, x_scale, w_scale, None, dtype) @perftest() From a6ab41bc5b9d5557b04729a2907de189675b1d1d Mon Sep 17 00:00:00 2001 From: solin Date: Tue, 18 Nov 2025 15:24:16 +0000 Subject: [PATCH 11/13] align aiter interface --- .../a8w8_bpreshuffle_cktile_tuned_gemm.csv | 465 +++++++++--------- .../a8w8_bpreshuffle_cktile_untuned_gemm.csv | 465 +++++++++--------- aiter/ops/gemm_op_a8w8.py | 65 +-- csrc/cktile_gemm_a8w8_bpreshuffle/README.md | 2 +- .../gemm_a8w8_bpreshuffle_cktile_tune.cu | 4 +- .../gemm_a8w8_bpreshuffle_cktile_tune.py | 26 +- .../gen_instances.py | 4 +- .../gemm_a8w8_bpreshuffle_cktile_common.cuh | 4 +- op_tests/test_gemm_a8w8.py | 16 - 9 files changed, 500 insertions(+), 551 deletions(-) mode change 100755 => 100644 aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv diff --git a/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv b/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv index 3ff676d588..d058eafe65 100644 --- a/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv +++ b/aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv @@ -1,244 +1,221 @@ -cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio 
-80,1,9216,4096,30,0,12.9048,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,5.85,2926.92,0.0 -80,2,9216,4096,2,0,12.9388,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,11.67,2920.97,0.0 -80,4,9216,4096,2,0,12.9816,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,23.26,2914.81,0.0 -80,8,9216,4096,30,0,13.1572,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,45.9,2882.75,0.0 -80,16,9216,4096,2,0,14.0139,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,86.2,2719.38,0.0 -80,32,9216,4096,9,0,16.7096,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,144.58,2302.25,0.0 -80,64,9216,4096,22,0,24.5395,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,196.9,1597.04,0.0 -80,128,9216,4096,24,0,35.4998,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,272.22,1144.58,0.0 -80,256,9216,4096,68,0,63.803,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,302.92,682.04,0.0 -80,1024,9216,4096,54,0,211.9977,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,364.67,286.88,0.0 -80,2048,9216,4096,54,0,410.3007,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,376.84,204.45,0.0 -80,4096,9216,4096,54,0,786.7339,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,393.07,165.27,0.0 -80,4240,9216,4096,54,0,829.5777,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,385.87,160.64,0.0 -80,16384,9216,4096,54,0,3084.9868,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,400.96,131.88,0.0 -80,32768,9216,4096,54,0,6135.9085,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,403.18,126.46,0.0 
-80,1,4608,4096,30,0,9.9687,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,3.79,1894.7,0.0 -80,2,4608,4096,2,0,9.8532,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,7.66,1918.26,0.0 -80,4,4608,4096,2,0,10.3186,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,14.63,1834.32,0.0 -80,8,4608,4096,30,0,10.2518,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,29.46,1851.47,0.0 -80,16,4608,4096,9,0,10.1797,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,59.33,1875.04,0.0 -80,32,4608,4096,37,0,11.0049,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,109.77,1753.8,0.0 -80,64,4608,4096,23,0,15.8202,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,152.71,1246.91,0.0 -80,128,4608,4096,24,0,22.8823,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,211.16,899.31,0.0 -80,256,4608,4096,24,0,35.1223,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,275.14,634.42,0.0 -80,1024,4608,4096,26,0,111.4377,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,346.87,291.7,0.0 -80,2048,4608,4096,54,0,210.7297,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,366.87,218.94,0.0 -80,4096,4608,4096,54,0,408.7369,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,378.28,179.58,0.0 -80,16384,4608,4096,54,0,1545.3929,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,400.21,153.34,0.0 -80,32768,4608,4096,54,0,3099.2964,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,399.11,146.83,0.0 -80,1,1280,8192,2,0,12.7965,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,1.64,820.26,0.0 
-80,32,1280,8192,2,0,13.6423,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,49.19,793.84,0.0 -80,64,1280,8192,2,0,13.4425,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,99.85,831.24,0.0 -80,128,1280,8192,9,0,16.5571,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,162.13,716.43,0.0 -80,192,1280,8192,23,0,24.9874,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,161.14,502.26,0.0 -80,256,1280,8192,23,0,25.3858,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,211.48,521.48,0.0 -80,320,1280,8192,23,0,36.0155,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,186.33,386.68,0.0 -80,512,1280,8192,24,0,39.0062,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,275.27,409.95,0.0 -80,1024,1280,8192,24,0,62.422,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,344.03,344.36,0.0 -80,2048,1280,8192,68,0,115.5764,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,371.61,281.25,0.0 -80,4096,1280,8192,68,0,224.3581,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,382.87,243.03,0.0 -80,8192,1280,8192,48,0,434.5678,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,395.33,226.81,0.0 -80,16384,1280,8192,48,0,855.908,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,401.44,218.07,0.0 -80,1,8192,1024,33,0,6.6064,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x64_default,2.54,1272.4,0.0 -80,32,8192,1024,37,0,6.9748,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,76.97,1282.57,0.0 -80,64,8192,1024,23,0,10.4742,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,102.51,907.25,0.0 
-80,128,8192,1024,24,0,14.6067,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,147.02,726.85,0.0 -80,192,8192,1024,51,0,18.4464,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,174.63,635.95,0.0 -80,256,8192,1024,46,0,22.7833,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,188.51,563.79,0.0 -80,320,8192,1024,48,0,24.6382,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,217.9,566.57,0.0 -80,512,8192,1024,46,0,37.6643,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,228.07,459.36,0.0 -80,1024,8192,1024,46,0,65.3665,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,262.82,401.04,0.0 -80,2048,8192,1024,41,0,117.7909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,291.7,373.88,0.0 -80,4096,8192,1024,41,0,222.3966,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,309.0,358.33,0.0 -80,8192,8192,1024,41,0,423.1597,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,324.79,356.83,0.0 -80,16384,8192,1024,41,0,821.258,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,334.7,357.5,0.0 -80,16,1536,7168,2,0,11.9166,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,29.57,937.67,0.0 -80,32,1536,7168,2,0,12.2456,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,57.54,925.86,0.0 -80,64,1536,7168,9,0,14.8021,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,95.21,788.09,0.0 -80,128,1536,7168,21,0,20.5898,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,136.89,598.39,0.0 -80,256,1536,7168,66,0,29.7058,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x192x256_1x4x1_16x16x64_default,189.77,458.88,0.0 
-80,512,1536,7168,58,0,44.6328,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,252.6,364.15,0.0 -80,1024,1536,7168,58,0,77.1886,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,292.12,278.48,0.0 -80,1536,1536,7168,20,0,101.7773,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x256x128_1x4x1_16x16x64_default,332.32,262.72,0.0 -80,2048,1536,7168,24,0,130.8105,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,344.75,244.49,0.0 -80,4096,1536,7168,48,0,242.9667,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,371.22,217.94,0.0 -80,8192,1536,7168,54,0,461.2919,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,391.05,205.72,0.0 -80,16384,1536,7168,54,0,905.9909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,398.21,197.33,0.0 -80,20480,1536,7168,48,0,1118.0539,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,403.35,197.42,0.0 -80,16,3072,1536,2,0,5.9618,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,25.33,812.08,0.0 -80,32,3072,1536,9,0,6.7498,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,44.74,735.48,0.0 -80,64,3072,1536,21,0,8.9754,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,67.29,580.49,0.0 -80,128,3072,1536,21,0,11.1207,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,108.62,512.7,0.0 -80,256,3072,1536,57,0,15.8689,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x192x256_1x4x1_16x16x64_default,152.24,421.24,0.0 -80,512,3072,1536,23,0,23.6895,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,203.97,365.17,0.0 -80,1024,3072,1536,46,0,39.0277,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,247.61,322.41,0.0 
-80,1536,3072,1536,54,0,50.9012,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,284.78,324.45,0.0 -80,2048,3072,1536,48,0,65.8281,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,293.6,310.62,0.0 -80,4096,3072,1536,54,0,117.4866,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,329.01,307.91,0.0 -80,8192,3072,1536,54,0,225.0856,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,343.47,300.48,0.0 -80,16384,3072,1536,54,0,438.9518,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,352.25,297.41,0.0 -80,20480,3072,1536,54,0,542.6733,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,356.15,298.53,0.0 -80,16,576,7168,2,0,12.0528,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,10.96,353.6,0.0 -80,32,576,7168,30,0,12.3251,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,21.44,356.59,0.0 -80,64,576,7168,2,0,12.2075,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,43.29,381.83,0.0 -80,128,576,7168,2,0,12.3514,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,85.57,420.5,0.0 -80,256,576,7168,9,0,14.9376,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,141.52,418.99,0.0 -80,512,576,7168,9,0,22.5703,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,187.32,371.67,0.0 -80,1024,576,7168,22,0,35.4728,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,238.37,356.57,0.0 -80,1536,576,7168,58,0,44.4162,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,285.56,380.68,0.0 -80,2048,576,7168,131,0,65.7335,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x64_default,257.27,322.03,0.0 
-80,4096,576,7168,74,0,107.8111,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_160x192x128_1x4x1_16x16x64_default,313.72,354.39,0.0 -80,8192,576,7168,58,0,186.7817,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,362.16,387.01,0.0 -80,16384,576,7168,54,0,359.7223,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,376.1,390.42,0.0 -80,20480,576,7168,54,0,436.2202,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,387.68,400.08,0.0 -80,16,7168,2048,2,0,9.1742,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,51.2,1628.72,0.0 -80,32,7168,2048,49,0,10.7453,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,87.44,1414.98,0.0 -80,64,7168,2048,49,0,13.6786,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,137.37,1149.87,0.0 -80,128,7168,2048,23,0,19.541,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,192.32,858.56,0.0 -80,256,7168,2048,18,0,30.4226,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,247.06,620.41,0.0 -80,512,7168,2048,48,0,52.0579,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,288.76,443.13,0.0 -80,1024,7168,2048,48,0,96.1383,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,312.72,327.21,0.0 -80,1536,7168,2048,46,0,135.3627,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,333.16,294.36,0.0 -80,2048,7168,2048,48,0,180.4336,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,333.25,267.33,0.0 -80,4096,7168,2048,41,0,338.9949,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,354.75,241.27,0.0 -80,8192,7168,2048,41,0,651.1562,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,369.37,228.67,0.0 
-80,16384,7168,2048,41,0,1282.8862,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,374.96,220.69,0.0 -80,20480,7168,2048,41,0,1602.6372,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,375.19,218.53,0.0 -80,16,4608,7168,9,0,14.6651,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,72.07,2270.17,0.0 -80,32,4608,7168,9,0,14.9106,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,141.77,2250.37,0.0 -80,64,4608,7168,23,0,23.5169,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,179.78,1449.12,0.0 -80,128,4608,7168,24,0,35.1222,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,240.75,1000.15,0.0 -80,256,4608,7168,24,0,55.8852,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,302.61,666.09,0.0 -80,512,4608,7168,0,0,103.0634,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default,328.18,401.88,0.0 -80,1024,4608,7168,26,0,184.555,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,366.53,269.88,0.0 -80,1536,4608,7168,18,0,272.9456,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,371.75,213.21,0.0 -80,2048,4608,7168,54,0,354.6475,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,381.48,187.75,0.0 -80,4096,4608,7168,54,0,692.6798,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,390.63,144.57,0.0 -80,8192,4608,7168,48,0,1343.0841,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,402.93,124.53,0.0 -80,16384,4608,7168,54,0,2639.293,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,410.08,114.22,0.0 -80,20480,4608,7168,54,0,3276.6577,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,412.89,112.48,0.0 
-80,16,7168,2304,21,0,11.0122,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,47.99,1523.88,0.0 -80,32,7168,2304,49,0,11.1539,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,94.76,1528.39,0.0 -80,64,7168,2304,49,0,14.327,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,147.55,1227.06,0.0 -80,128,7168,2304,23,0,21.2273,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,199.17,878.35,0.0 -80,256,7168,2304,46,0,31.7244,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,266.54,654.86,0.0 -80,512,7168,2304,48,0,56.153,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,301.17,445.83,0.0 -80,1024,7168,2304,48,0,103.9455,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,325.39,322.81,0.0 -80,1536,7168,2304,46,0,144.8029,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,350.37,290.56,0.0 -80,2048,7168,2304,46,0,193.0686,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,350.37,262.05,0.0 -80,4096,7168,2304,41,0,366.5239,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,369.12,231.01,0.0 -80,8192,7168,2304,41,0,709.1492,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,381.56,215.51,0.0 -80,16384,7168,2304,41,0,1404.7256,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,385.25,205.84,0.0 -80,20480,7168,2304,41,0,1741.0107,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,388.54,205.23,0.0 -80,16,512,7168,30,0,11.9822,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,9.8,317.23,0.0 -80,32,512,7168,2,0,12.2803,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,19.13,320.2,0.0 
-80,64,512,7168,30,0,12.315,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,38.15,340.58,0.0 -80,128,512,7168,2,0,12.3314,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,76.19,382.65,0.0 -80,256,512,7168,9,0,14.8513,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,126.52,388.33,0.0 -80,512,512,7168,37,0,22.2836,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,168.65,352.92,0.0 -80,1024,512,7168,24,0,34.5858,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,217.32,348.66,0.0 -80,1536,512,7168,99,0,47.427,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x128x256_1x4x1_16x16x64_default,237.72,342.69,0.0 -80,2048,512,7168,24,0,54.6059,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,275.29,374.45,0.0 -80,4096,512,7168,18,0,101.3206,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,296.73,367.39,0.0 -80,8192,512,7168,24,0,180.3432,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,333.42,392.47,0.0 -80,16384,512,7168,24,0,332.6272,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,361.54,414.54,0.0 -80,20480,512,7168,48,0,387.5191,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,387.91,442.41,0.0 -80,16,4096,512,30,0,3.9474,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,17.0,566.55,0.0 -80,32,4096,512,49,0,4.2976,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,31.23,552.79,0.0 -80,64,4096,512,49,0,5.4494,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,49.26,487.06,0.0 -80,128,4096,512,49,0,6.8127,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,78.8,471.36,0.0 -80,256,4096,512,15,0,10.6842,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x128_1x4x1_16x16x64_default,100.5,404.84,0.0 
-80,512,4096,512,45,0,15.5946,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,137.71,420.25,0.0 -80,1024,4096,512,46,0,25.069,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,171.33,439.19,0.0 -80,1536,4096,512,46,0,33.2391,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,193.82,465.31,0.0 -80,2048,4096,512,46,0,42.7751,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,200.82,465.76,0.0 -80,4096,4096,512,41,0,75.405,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,227.83,500.61,0.0 -80,8192,4096,512,41,0,137.5652,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,249.77,533.57,0.0 -80,16384,4096,512,41,0,262.5162,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,261.77,551.22,0.0 -80,20480,4096,512,41,0,322.8676,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,266.05,558.6,0.0 -80,16,7168,256,33,0,4.1533,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x64_default,14.14,498.03,0.0 -80,32,7168,256,15,0,4.3338,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x128_1x4x1_16x16x64_default,27.1,531.16,0.0 -80,64,7168,256,16,0,5.6176,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x128_1x4x1_16x16x64_default,41.81,492.9,0.0 -80,128,7168,256,45,0,7.0727,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,66.42,523.53,0.0 -80,256,7168,256,18,0,11.1402,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,84.34,500.04,0.0 -80,512,7168,256,45,0,16.3518,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,114.91,569.12,0.0 -80,1024,7168,256,45,0,27.5099,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,136.61,609.86,0.0 
-80,1536,7168,256,45,0,36.8025,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,153.17,658.88,0.0 -80,2048,7168,256,41,0,47.1463,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,159.42,672.79,0.0 -80,4096,7168,256,41,0,85.1181,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,176.61,723.75,0.0 -80,8192,7168,256,41,0,156.5563,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,192.04,775.27,0.0 -80,16384,7168,256,41,0,301.4756,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,199.45,799.1,0.0 -80,20480,7168,256,41,0,373.7383,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,201.11,804.52,0.0 -80,32,7168,1536,51,0,9.8996,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,71.18,1163.48,0.0 -80,32,7168,576,41,0,13.2274,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,19.98,348.21,0.0 -80,32,2048,7168,2,0,12.4173,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,75.66,1211.25,0.0 -80,32,7168,512,49,0,5.5426,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,42.38,747.87,0.0 -80,32,256,7168,2,0,10.8848,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,10.79,191.16,0.0 -80,64,7168,1536,21,0,11.5816,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,121.68,1038.36,0.0 -80,64,7168,576,41,0,13.6023,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,38.85,373.7,0.0 -80,64,2048,7168,9,0,14.9748,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,125.48,1028.46,0.0 -80,64,7168,512,21,0,6.7758,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,69.33,681.88,0.0 
-80,64,256,7168,2,0,11.4051,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,20.59,203.99,0.0 -80,96,7168,1536,17,0,14.9313,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x128_1x4x1_16x16x64_default,141.58,839.43,0.0 -80,96,7168,576,41,0,14.1469,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,56.04,393.04,0.0 -80,96,2048,7168,21,0,20.7002,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,136.16,761.41,0.0 -80,96,7168,512,17,0,8.226,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x128_1x4x1_16x16x64_default,85.66,619.43,0.0 -80,96,256,7168,30,0,11.4763,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,30.7,224.14,0.0 -80,128,7168,1536,51,0,16.5378,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,170.43,788.6,0.0 -80,128,7168,576,41,0,14.7401,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,71.71,409.6,0.0 -80,128,2048,7168,9,0,22.4085,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,167.71,719.45,0.0 -80,128,7168,512,23,0,10.1582,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,92.49,548.38,0.0 -80,128,256,7168,30,0,11.8782,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,39.55,237.25,0.0 -80,256,7168,1536,46,0,25.3317,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,222.53,595.04,0.0 -80,256,7168,576,71,0,17.4843,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x64_1x4x1_16x16x64_default,120.9,454.48,0.0 -80,256,2048,7168,24,0,34.809,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,215.93,504.57,0.0 -80,256,7168,512,18,0,14.0471,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,133.77,531.86,0.0 -80,256,256,7168,2,0,12.0719,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,77.83,314.87,0.0 
-80,512,7168,1536,48,0,42.1033,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,267.78,454.51,0.0 -80,512,7168,576,71,0,22.909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x64_1x4x1_16x16x64_default,184.55,513.5,0.0 -80,512,2048,7168,24,0,55.1913,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,272.37,370.48,0.0 -80,512,7168,512,48,0,22.4995,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,167.03,501.0,0.0 -80,512,256,7168,9,0,14.8883,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,126.21,387.36,0.0 -80,1024,7168,1536,48,0,77.1567,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,292.24,353.35,0.0 -80,1024,7168,576,41,0,39.3022,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,215.15,493.58,0.0 -80,1024,2048,7168,18,0,100.866,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,298.07,259.89,0.0 -80,1024,7168,512,46,0,39.1797,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,191.84,481.74,0.0 -80,1024,256,7168,9,0,22.0293,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,170.6,440.29,0.0 -80,2048,7168,1536,41,0,143.6136,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,314.02,303.01,0.0 -80,2048,7168,576,41,0,69.2915,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,244.06,500.33,0.0 -80,2048,2048,7168,18,0,177.5591,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,338.65,212.6,0.0 -80,2048,7168,512,41,0,68.617,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,219.08,496.65,0.0 -80,2048,256,7168,24,0,35.1531,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,213.81,499.63,0.0 
-80,4096,7168,1536,41,0,264.7829,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,340.63,287.11,0.0 -80,4096,7168,576,41,0,125.0143,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,270.55,521.61,0.0 -80,4096,2048,7168,18,0,322.9083,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,372.42,188.34,0.0 -80,4096,7168,512,41,0,124.4324,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,241.62,518.25,0.0 -80,4096,256,7168,24,0,55.334,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,271.67,601.66,0.0 -80,8192,7168,1536,41,0,503.6996,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,358.13,280.0,0.0 -80,8192,7168,576,41,0,236.1366,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,286.47,534.81,0.0 -80,8192,2048,7168,48,0,610.2692,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,394.12,175.26,0.0 -80,8192,7168,512,41,0,230.7146,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,260.62,543.12,0.0 -80,8192,256,7168,68,0,102.6022,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,293.02,631.07,0.0 -80,16384,7168,1536,41,0,995.5304,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,362.4,272.27,0.0 -80,16384,7168,576,41,0,461.9614,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,292.86,537.81,0.0 -80,16384,2048,7168,48,0,1211.3868,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,397.1,164.46,0.0 -80,16384,7168,512,41,0,448.4529,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,268.16,550.65,0.0 -80,16384,256,7168,24,0,182.3289,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,329.79,700.19,0.0 
-80,5112,6912,5120,54,0,899.3494,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,402.31,147.03,0.0 -80,5104,5120,8192,48,0,1053.0629,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,406.58,129.17,0.0 -80,2048,4096,5120,18,0,234.6564,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,366.06,205.55,0.0 -80,5120,5120,4096,48,0,547.1132,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,392.51,172.49,0.0 -80,5120,5120,8192,48,0,1062.5069,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,404.23,128.3,0.0 -80,32,2112,7168,2,0,12.3951,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,78.17,1250.77,0.0 -80,64,2112,7168,9,0,14.9879,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,129.29,1058.71,0.0 -80,96,2112,7168,9,0,22.2991,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,130.35,727.94,0.0 -80,128,2112,7168,9,0,22.6786,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,170.89,731.83,0.0 -80,256,2112,7168,22,0,35.8442,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,216.24,503.71,0.0 -80,512,2112,7168,131,0,65.1728,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x64_default,237.86,321.78,0.0 -80,1024,2112,7168,121,0,107.0765,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x64_default,289.55,250.33,0.0 -80,2048,2112,7168,26,0,183.0673,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,338.72,210.14,0.0 -80,4096,2112,7168,54,0,319.8768,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,387.7,193.2,0.0 -80,8192,2112,7168,54,0,628.2407,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,394.81,172.64,0.0 
-80,16384,2112,7168,54,0,1249.322,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,397.07,161.52,0.0 +cu_num,M,N,K,q_dtype_w,kernelId,splitK,us,kernelName,tflops,bw,errRatio +80,1,9216,4096,torch.float8_e4m3fnuz,30,0,13.5714,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,5.56,2783.15,0.0 +80,2,9216,4096,torch.float8_e4m3fnuz,2,0,14.0907,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,10.72,2682.18,0.0 +80,4,9216,4096,torch.float8_e4m3fnuz,2,0,13.9599,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,21.63,2710.54,0.0 +80,8,9216,4096,torch.float8_e4m3fnuz,2,0,13.9876,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,43.18,2711.61,0.0 +80,16,9216,4096,torch.float8_e4m3fnuz,30,0,14.8257,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,81.48,2570.48,0.0 +80,32,9216,4096,torch.float8_e4m3fnuz,9,0,17.4829,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,138.19,2200.41,0.0 +80,64,9216,4096,torch.float8_e4m3fnuz,22,0,24.8908,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,194.12,1574.5,0.0 +80,128,9216,4096,torch.float8_e4m3fnuz,24,0,37.3315,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,258.86,1088.42,0.0 +80,256,9216,4096,torch.float8_e4m3fnuz,0,0,66.9303,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default,288.77,650.17,0.0 +80,1024,9216,4096,torch.float8_e4m3fnuz,54,0,217.4525,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,355.52,279.68,0.0 +80,2048,9216,4096,torch.float8_e4m3fnuz,54,0,421.0536,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,367.22,199.23,0.0 +80,4096,9216,4096,torch.float8_e4m3fnuz,54,0,814.1634,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,379.82,159.7,0.0 
+80,4240,9216,4096,torch.float8_e4m3fnuz,54,0,857.5538,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,373.28,155.4,0.0 +80,16384,9216,4096,torch.float8_e4m3fnuz,54,0,3176.6345,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,389.39,128.08,0.0 +80,32768,9216,4096,torch.float8_e4m3fnuz,54,0,6363.4917,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,388.76,121.94,0.0 +80,1,4608,4096,torch.float8_e4m3fnuz,2,0,10.0986,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,3.74,1870.33,0.0 +80,2,4608,4096,torch.float8_e4m3fnuz,2,0,9.7346,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,7.76,1941.63,0.0 +80,4,4608,4096,torch.float8_e4m3fnuz,30,0,9.434,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,16.01,2006.32,0.0 +80,8,4608,4096,torch.float8_e4m3fnuz,30,0,10.005,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,30.18,1897.14,0.0 +80,16,4608,4096,torch.float8_e4m3fnuz,30,0,9.6456,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,62.62,1978.87,0.0 +80,32,4608,4096,torch.float8_e4m3fnuz,9,0,11.8074,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,102.31,1634.6,0.0 +80,64,4608,4096,torch.float8_e4m3fnuz,23,0,16.1455,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,149.63,1221.79,0.0 +80,128,4608,4096,torch.float8_e4m3fnuz,24,0,23.9186,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,202.01,860.35,0.0 +80,256,4608,4096,torch.float8_e4m3fnuz,24,0,37.1547,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,260.09,599.72,0.0 +80,1024,4608,4096,torch.float8_e4m3fnuz,26,0,116.0039,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,333.22,280.21,0.0 
+80,2048,4608,4096,torch.float8_e4m3fnuz,54,0,215.9621,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,357.98,213.64,0.0 +80,4096,4608,4096,torch.float8_e4m3fnuz,54,0,420.4927,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,367.71,174.56,0.0 +80,16384,4608,4096,torch.float8_e4m3fnuz,54,0,1606.2618,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,385.04,147.53,0.0 +80,32768,4608,4096,torch.float8_e4m3fnuz,54,0,3174.959,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,389.6,143.33,0.0 +80,1,1280,8192,torch.float8_e4m3fnuz,2,0,13.6383,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,1.54,769.63,0.0 +80,32,1280,8192,torch.float8_e4m3fnuz,2,0,13.9553,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,48.09,776.04,0.0 +80,64,1280,8192,torch.float8_e4m3fnuz,2,0,14.4176,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,93.09,775.02,0.0 +80,128,1280,8192,torch.float8_e4m3fnuz,9,0,17.7099,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,151.57,669.8,0.0 +80,192,1280,8192,torch.float8_e4m3fnuz,37,0,25.2086,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,159.73,497.85,0.0 +80,256,1280,8192,torch.float8_e4m3fnuz,23,0,25.7499,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,208.49,514.11,0.0 +80,320,1280,8192,torch.float8_e4m3fnuz,23,0,36.6913,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,182.9,379.56,0.0 +80,512,1280,8192,torch.float8_e4m3fnuz,60,0,40.6609,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x256x256_1x4x1_16x16x64_default,264.07,393.27,0.0 +80,1024,1280,8192,torch.float8_e4m3fnuz,24,0,63.7933,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,336.63,336.96,0.0 
+80,2048,1280,8192,torch.float8_e4m3fnuz,20,0,118.4338,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x256x128_1x4x1_16x16x64_default,362.65,274.46,0.0 +80,4096,1280,8192,torch.float8_e4m3fnuz,68,0,229.0696,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,374.99,238.03,0.0 +80,8192,1280,8192,torch.float8_e4m3fnuz,48,0,438.2764,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,391.99,224.89,0.0 +80,16384,1280,8192,torch.float8_e4m3fnuz,48,0,863.6412,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,397.85,216.12,0.0 +80,1,8192,1024,torch.float8_e4m3fnuz,30,0,6.807,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,2.46,1234.91,0.0 +80,32,8192,1024,torch.float8_e4m3fnuz,49,0,8.0046,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,67.07,1117.57,0.0 +80,64,8192,1024,torch.float8_e4m3fnuz,50,0,10.9114,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x64x256_1x4x1_16x16x64_default,98.41,870.9,0.0 +80,128,8192,1024,torch.float8_e4m3fnuz,24,0,16.2559,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,132.1,653.11,0.0 +80,192,8192,1024,torch.float8_e4m3fnuz,51,0,20.526,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,156.93,571.52,0.0 +80,256,8192,1024,torch.float8_e4m3fnuz,18,0,24.7986,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,173.19,517.98,0.0 +80,320,8192,1024,torch.float8_e4m3fnuz,18,0,27.2902,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,196.73,511.51,0.0 +80,512,8192,1024,torch.float8_e4m3fnuz,46,0,41.0662,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,209.17,421.31,0.0 +80,1024,8192,1024,torch.float8_e4m3fnuz,41,0,73.8261,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,232.71,355.08,0.0 
+80,2048,8192,1024,torch.float8_e4m3fnuz,41,0,130.8248,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,262.64,336.63,0.0 +80,4096,8192,1024,torch.float8_e4m3fnuz,41,0,244.4845,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,281.08,325.96,0.0 +80,8192,8192,1024,torch.float8_e4m3fnuz,41,0,468.0259,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,293.66,322.62,0.0 +80,16384,8192,1024,torch.float8_e4m3fnuz,41,0,910.2301,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,301.99,322.56,0.0 +80,16,1536,7168,torch.float8_e4m3fnuz,2,0,12.8707,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,27.37,868.16,0.0 +80,32,1536,7168,torch.float8_e4m3fnuz,2,0,13.6657,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,51.56,829.65,0.0 +80,64,1536,7168,torch.float8_e4m3fnuz,37,0,15.7756,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,89.33,739.46,0.0 +80,128,1536,7168,torch.float8_e4m3fnuz,49,0,21.3394,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,132.08,577.37,0.0 +80,256,1536,7168,torch.float8_e4m3fnuz,57,0,29.7781,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x192x256_1x4x1_16x16x64_default,189.31,457.77,0.0 +80,512,1536,7168,torch.float8_e4m3fnuz,58,0,45.9558,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,245.33,353.66,0.0 +80,1024,1536,7168,torch.float8_e4m3fnuz,58,0,79.0489,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,285.25,271.93,0.0 +80,1536,1536,7168,torch.float8_e4m3fnuz,0,0,104.6251,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default,323.28,255.57,0.0 +80,2048,1536,7168,torch.float8_e4m3fnuz,24,0,134.4692,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,335.37,237.84,0.0 
+80,4096,1536,7168,torch.float8_e4m3fnuz,48,0,246.2629,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,366.25,215.03,0.0 +80,8192,1536,7168,torch.float8_e4m3fnuz,54,0,467.0737,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,386.21,203.17,0.0 +80,16384,1536,7168,torch.float8_e4m3fnuz,54,0,915.0814,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,394.26,195.37,0.0 +80,20480,1536,7168,torch.float8_e4m3fnuz,54,0,1130.2865,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,398.99,195.28,0.0 +80,16,3072,1536,torch.float8_e4m3fnuz,30,0,6.0562,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,24.93,799.42,0.0 +80,32,3072,1536,torch.float8_e4m3fnuz,37,0,7.0704,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,42.71,702.13,0.0 +80,64,3072,1536,torch.float8_e4m3fnuz,49,0,9.2954,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,64.98,560.5,0.0 +80,128,3072,1536,torch.float8_e4m3fnuz,23,0,12.03,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,100.41,473.95,0.0 +80,256,3072,1536,torch.float8_e4m3fnuz,57,0,16.8707,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x192x256_1x4x1_16x16x64_default,143.2,396.23,0.0 +80,512,3072,1536,torch.float8_e4m3fnuz,23,0,25.0877,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,192.6,344.82,0.0 +80,1024,3072,1536,torch.float8_e4m3fnuz,46,0,42.7756,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,225.92,294.16,0.0 +80,1536,3072,1536,torch.float8_e4m3fnuz,54,0,54.8776,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,264.14,300.94,0.0 +80,2048,3072,1536,torch.float8_e4m3fnuz,54,0,71.0938,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,271.86,287.61,0.0 
+80,4096,3072,1536,torch.float8_e4m3fnuz,54,0,127.4567,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,303.28,283.83,0.0 +80,8192,3072,1536,torch.float8_e4m3fnuz,54,0,241.298,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,320.39,280.29,0.0 +80,16384,3072,1536,torch.float8_e4m3fnuz,54,0,470.5864,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,328.57,277.41,0.0 +80,20480,3072,1536,torch.float8_e4m3fnuz,54,0,586.3499,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,329.62,276.29,0.0 +80,16,576,7168,torch.float8_e4m3fnuz,2,0,12.195,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,10.83,349.48,0.0 +80,32,576,7168,torch.float8_e4m3fnuz,2,0,12.3508,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,21.39,355.85,0.0 +80,64,576,7168,torch.float8_e4m3fnuz,2,0,12.6815,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,41.67,367.56,0.0 +80,128,576,7168,torch.float8_e4m3fnuz,30,0,12.9793,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,81.43,400.15,0.0 +80,256,576,7168,torch.float8_e4m3fnuz,37,0,15.7506,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,134.21,397.36,0.0 +80,512,576,7168,torch.float8_e4m3fnuz,9,0,22.8675,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,184.89,366.84,0.0 +80,1024,576,7168,torch.float8_e4m3fnuz,50,0,36.5975,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x64x256_1x4x1_16x16x64_default,231.05,345.61,0.0 +80,1536,576,7168,torch.float8_e4m3fnuz,58,0,45.6058,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,278.11,370.75,0.0 +80,2048,576,7168,torch.float8_e4m3fnuz,131,0,67.4424,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x64_default,250.75,313.87,0.0 
+80,4096,576,7168,torch.float8_e4m3fnuz,88,0,110.3714,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_160x192x128_1x4x1_16x16x64_default,306.45,346.17,0.0 +80,8192,576,7168,torch.float8_e4m3fnuz,58,0,190.2246,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x256_1x4x1_16x16x64_default,355.61,380.0,0.0 +80,16384,576,7168,torch.float8_e4m3fnuz,54,0,365.3289,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,370.33,384.43,0.0 +80,20480,576,7168,torch.float8_e4m3fnuz,54,0,439.7549,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,384.56,396.86,0.0 +80,16,7168,2048,torch.float8_e4m3fnuz,30,0,8.7391,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,53.75,1709.81,0.0 +80,32,7168,2048,torch.float8_e4m3fnuz,9,0,10.512,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,89.38,1446.38,0.0 +80,64,7168,2048,torch.float8_e4m3fnuz,50,0,14.8952,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x64x256_1x4x1_16x16x64_default,126.15,1055.95,0.0 +80,128,7168,2048,torch.float8_e4m3fnuz,23,0,20.7357,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,181.24,809.1,0.0 +80,256,7168,2048,torch.float8_e4m3fnuz,18,0,31.9102,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,235.54,591.48,0.0 +80,512,7168,2048,torch.float8_e4m3fnuz,48,0,54.7699,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,274.46,421.19,0.0 +80,1024,7168,2048,torch.float8_e4m3fnuz,48,0,102.267,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,293.98,307.6,0.0 +80,1536,7168,2048,torch.float8_e4m3fnuz,18,0,142.7196,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,315.98,279.19,0.0 +80,2048,7168,2048,torch.float8_e4m3fnuz,18,0,192.4732,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,312.4,250.6,0.0 
+80,4096,7168,2048,torch.float8_e4m3fnuz,41,0,358.4362,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,335.51,228.18,0.0 +80,8192,7168,2048,torch.float8_e4m3fnuz,41,0,685.4349,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,350.9,217.23,0.0 +80,16384,7168,2048,torch.float8_e4m3fnuz,41,0,1352.1887,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,355.75,209.38,0.0 +80,20480,7168,2048,torch.float8_e4m3fnuz,41,0,1690.7436,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,355.64,207.14,0.0 +80,16,4608,7168,torch.float8_e4m3fnuz,2,0,13.3851,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,78.97,2487.26,0.0 +80,32,4608,7168,torch.float8_e4m3fnuz,9,0,15.7244,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,134.44,2133.91,0.0 +80,64,4608,7168,torch.float8_e4m3fnuz,23,0,24.1164,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,175.31,1413.09,0.0 +80,128,4608,7168,torch.float8_e4m3fnuz,24,0,36.1367,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,233.99,972.07,0.0 +80,256,4608,7168,torch.float8_e4m3fnuz,24,0,59.0996,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,286.15,629.86,0.0 +80,512,4608,7168,torch.float8_e4m3fnuz,0,0,104.9786,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x128_1x4x1_16x16x64_default,322.19,394.54,0.0 +80,1024,4608,7168,torch.float8_e4m3fnuz,26,0,187.2691,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,361.22,265.97,0.0 +80,1536,4608,7168,torch.float8_e4m3fnuz,46,0,278.8125,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,363.93,208.73,0.0 +80,2048,4608,7168,torch.float8_e4m3fnuz,54,0,358.9201,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,376.94,185.51,0.0 
+80,4096,4608,7168,torch.float8_e4m3fnuz,54,0,703.9066,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,384.4,142.26,0.0 +80,8192,4608,7168,torch.float8_e4m3fnuz,54,0,1358.216,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,398.44,123.14,0.0 +80,16384,4608,7168,torch.float8_e4m3fnuz,54,0,2673.4661,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,404.84,112.76,0.0 +80,20480,4608,7168,torch.float8_e4m3fnuz,54,0,3349.1641,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,403.96,110.05,0.0 +80,16,7168,2304,torch.float8_e4m3fnuz,33,0,10.2988,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x64_default,51.31,1629.44,0.0 +80,32,7168,2304,torch.float8_e4m3fnuz,21,0,11.7542,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,89.92,1450.34,0.0 +80,64,7168,2304,torch.float8_e4m3fnuz,21,0,15.5548,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,135.9,1130.2,0.0 +80,128,7168,2304,torch.float8_e4m3fnuz,16,0,22.3182,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x128_1x4x1_16x16x64_default,189.44,835.42,0.0 +80,256,7168,2304,torch.float8_e4m3fnuz,46,0,33.7458,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,250.57,615.63,0.0 +80,512,7168,2304,torch.float8_e4m3fnuz,48,0,59.6275,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,283.62,419.85,0.0 +80,1024,7168,2304,torch.float8_e4m3fnuz,41,0,109.4005,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,309.17,306.71,0.0 +80,1536,7168,2304,torch.float8_e4m3fnuz,18,0,152.2543,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,333.22,276.34,0.0 +80,2048,7168,2304,torch.float8_e4m3fnuz,18,0,203.1162,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,333.04,249.09,0.0 
+80,4096,7168,2304,torch.float8_e4m3fnuz,41,0,382.4726,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,353.73,221.38,0.0 +80,8192,7168,2304,torch.float8_e4m3fnuz,41,0,741.673,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,364.83,206.06,0.0 +80,16384,7168,2304,torch.float8_e4m3fnuz,41,0,1464.3626,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,369.56,197.45,0.0 +80,20480,7168,2304,torch.float8_e4m3fnuz,41,0,1814.2843,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,372.85,196.94,0.0 +80,16,512,7168,torch.float8_e4m3fnuz,2,0,12.5374,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,9.37,303.18,0.0 +80,32,512,7168,torch.float8_e4m3fnuz,2,0,12.4268,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,18.9,316.43,0.0 +80,64,512,7168,torch.float8_e4m3fnuz,30,0,12.506,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,37.56,335.38,0.0 +80,128,512,7168,torch.float8_e4m3fnuz,2,0,12.6386,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,74.34,373.35,0.0 +80,256,512,7168,torch.float8_e4m3fnuz,9,0,15.4559,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,121.57,373.14,0.0 +80,512,512,7168,torch.float8_e4m3fnuz,9,0,22.6732,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,165.75,346.86,0.0 +80,1024,512,7168,torch.float8_e4m3fnuz,24,0,35.1171,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,214.03,343.38,0.0 +80,1536,512,7168,torch.float8_e4m3fnuz,99,0,48.688,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x128x256_1x4x1_16x16x64_default,231.56,333.82,0.0 +80,2048,512,7168,torch.float8_e4m3fnuz,24,0,55.6177,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,270.28,367.64,0.0 
+80,4096,512,7168,torch.float8_e4m3fnuz,18,0,103.2737,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,291.12,360.44,0.0 +80,8192,512,7168,torch.float8_e4m3fnuz,24,0,183.9884,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,326.81,384.69,0.0 +80,16384,512,7168,torch.float8_e4m3fnuz,24,0,339.0609,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,354.68,406.68,0.0 +80,20480,512,7168,torch.float8_e4m3fnuz,68,0,397.1766,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x256_1x4x1_16x16x64_default,378.48,431.65,0.0 +80,16,4096,512,torch.float8_e4m3fnuz,33,0,4.0949,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x64_default,16.39,546.15,0.0 +80,32,4096,512,torch.float8_e4m3fnuz,21,0,4.7315,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,28.37,502.1,0.0 +80,64,4096,512,torch.float8_e4m3fnuz,49,0,5.8289,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,46.05,455.35,0.0 +80,128,4096,512,torch.float8_e4m3fnuz,49,0,7.7762,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,69.04,412.96,0.0 +80,256,4096,512,torch.float8_e4m3fnuz,23,0,11.612,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,92.47,372.49,0.0 +80,512,4096,512,torch.float8_e4m3fnuz,45,0,17.9873,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,119.39,364.35,0.0 +80,1024,4096,512,torch.float8_e4m3fnuz,18,0,28.8943,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,148.64,381.05,0.0 +80,1536,4096,512,torch.float8_e4m3fnuz,18,0,38.5091,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x128_1x4x1_16x16x64_default,167.3,401.63,0.0 +80,2048,4096,512,torch.float8_e4m3fnuz,46,0,49.318,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,174.17,403.97,0.0 
+80,4096,4096,512,torch.float8_e4m3fnuz,41,0,86.521,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,198.56,436.3,0.0 +80,8192,4096,512,torch.float8_e4m3fnuz,41,0,159.4617,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,215.47,460.3,0.0 +80,16384,4096,512,torch.float8_e4m3fnuz,41,0,304.593,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,225.61,475.07,0.0 +80,20480,4096,512,torch.float8_e4m3fnuz,41,0,375.3411,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,228.86,480.51,0.0 +80,16,7168,256,torch.float8_e4m3fnuz,5,0,4.0505,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x256_1x4x1_16x16x64_default,14.5,510.67,0.0 +80,32,7168,256,torch.float8_e4m3fnuz,49,0,4.9536,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,23.71,464.7,0.0 +80,64,7168,256,torch.float8_e4m3fnuz,22,0,6.3078,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x64_default,37.24,438.96,0.0 +80,128,7168,256,torch.float8_e4m3fnuz,17,0,8.5847,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x128_1x4x1_16x16x64_default,54.72,431.32,0.0 +80,256,7168,256,torch.float8_e4m3fnuz,46,0,12.3108,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x128x128_1x4x1_16x16x64_default,76.32,452.49,0.0 +80,512,7168,256,torch.float8_e4m3fnuz,10,0,20.0727,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x256x64_1x4x1_16x16x64_default,93.61,463.62,0.0 +80,1024,7168,256,torch.float8_e4m3fnuz,41,0,34.008,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,110.51,493.33,0.0 +80,1536,7168,256,torch.float8_e4m3fnuz,45,0,47.0605,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,119.79,515.26,0.0 +80,2048,7168,256,torch.float8_e4m3fnuz,41,0,59.2228,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,126.91,535.59,0.0 
+80,4096,7168,256,torch.float8_e4m3fnuz,41,0,105.2494,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,142.83,585.31,0.0 +80,8192,7168,256,torch.float8_e4m3fnuz,41,0,197.2828,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,152.39,615.22,0.0 +80,16384,7168,256,torch.float8_e4m3fnuz,41,0,381.565,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,157.59,631.37,0.0 +80,20480,7168,256,torch.float8_e4m3fnuz,41,0,472.001,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,159.24,637.03,0.0 +80,1,4096,512,torch.float8_e4m3fnuz,5,0,4.0057,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x256_1x4x1_16x16x64_default,1.05,525.71,0.0 +80,1,2112,7168,torch.float8_e4m3fnuz,2,0,12.3092,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,2.46,1230.8,0.0 +80,1,4608,7168,torch.float8_e4m3fnuz,2,0,12.6122,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,5.24,2620.2,0.0 +80,1,7168,2304,torch.float8_e4m3fnuz,5,0,10.0275,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x256_1x4x1_16x16x64_default,3.29,1648.64,0.0 +80,1,512,7168,torch.float8_e4m3fnuz,30,0,12.268,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,0.6,299.82,0.0 +80,1,7168,256,torch.float8_e4m3fnuz,5,0,4.1686,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x256_1x4x1_16x16x64_default,0.88,443.7,0.0 +80,16,2112,7168,torch.float8_e4m3fnuz,2,0,12.2697,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,39.48,1248.69,0.0 +80,32,2112,7168,torch.float8_e4m3fnuz,2,0,12.5735,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,77.06,1233.02,0.0 +80,48,4096,512,torch.float8_e4m3fnuz,33,0,5.6993,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x64_default,35.32,441.27,0.0 
+80,48,2112,7168,torch.float8_e4m3fnuz,9,0,15.4811,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,93.88,1013.21,0.0 +80,48,4608,7168,torch.float8_e4m3fnuz,23,0,23.3699,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,135.68,1447.01,0.0 +80,48,7168,2304,torch.float8_e4m3fnuz,49,0,15.3526,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x256_1x4x1_16x16x64_default,103.27,1127.74,0.0 +80,48,512,7168,torch.float8_e4m3fnuz,2,0,12.4611,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,28.27,326.07,0.0 +80,48,7168,256,torch.float8_e4m3fnuz,50,0,6.0534,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x64x256_1x4x1_16x16x64_default,29.1,418.84,0.0 +80,64,2112,7168,torch.float8_e4m3fnuz,9,0,15.7138,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,123.32,1009.81,0.0 +80,80,4096,512,torch.float8_e4m3fnuz,30,0,6.5582,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,51.16,425.95,0.0 +80,80,2112,7168,torch.float8_e4m3fnuz,37,0,22.7948,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,106.26,704.12,0.0 +80,80,4608,7168,torch.float8_e4m3fnuz,57,0,29.8014,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x192x256_1x4x1_16x16x64_default,177.33,1152.32,0.0 +80,80,7168,2304,torch.float8_e4m3fnuz,45,0,18.9166,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x128_1x4x1_16x16x64_default,139.69,943.42,0.0 +80,80,512,7168,torch.float8_e4m3fnuz,30,0,12.5603,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x64_default,46.75,344.37,0.0 +80,80,7168,256,torch.float8_e4m3fnuz,6,0,7.3721,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x128x256_1x4x1_16x16x64_default,39.83,407.26,0.0 +80,96,4096,512,torch.float8_e4m3fnuz,21,0,7.2506,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x256_1x4x1_16x16x64_default,55.53,404.48,0.0 
+80,96,2112,7168,torch.float8_e4m3fnuz,37,0,23.0909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,125.88,702.98,0.0 +80,96,4608,7168,torch.float8_e4m3fnuz,66,0,31.2288,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x192x256_1x4x1_16x16x64_default,203.07,1108.05,0.0 +80,96,7168,2304,torch.float8_e4m3fnuz,17,0,19.3679,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x128_1x4x1_16x16x64_default,163.72,935.18,0.0 +80,96,512,7168,torch.float8_e4m3fnuz,2,0,12.4871,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,56.43,356.88,0.0 +80,96,7168,256,torch.float8_e4m3fnuz,23,0,8.0488,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,43.77,402.03,0.0 +80,112,4096,512,torch.float8_e4m3fnuz,51,0,7.9797,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x128x256_1x4x1_16x16x64_default,58.87,384.98,0.0 +80,112,2112,7168,torch.float8_e4m3fnuz,9,0,22.8662,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,148.3,717.86,0.0 +80,112,4608,7168,torch.float8_e4m3fnuz,24,0,35.9043,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x64_default,206.07,971.06,0.0 +80,112,7168,2304,torch.float8_e4m3fnuz,23,0,22.119,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x128x256_1x4x1_16x16x64_default,167.25,830.9,0.0 +80,112,512,7168,torch.float8_e4m3fnuz,2,0,12.7783,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x64_default,64.33,359.01,0.0 +80,112,7168,256,torch.float8_e4m3fnuz,34,0,7.9613,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x128x256_1x4x1_16x16x64_default,51.63,435.77,0.0 +80,128,2112,7168,torch.float8_e4m3fnuz,37,0,22.8679,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x64_default,169.47,725.78,0.0 +80,256,2112,7168,torch.float8_e4m3fnuz,50,0,36.4203,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x64x256_1x4x1_16x16x64_default,212.82,495.74,0.0 
+80,512,2112,7168,torch.float8_e4m3fnuz,98,0,67.0873,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x192x256_1x4x1_16x16x64_default,231.07,312.6,0.0 +80,1024,2112,7168,torch.float8_e4m3fnuz,57,0,109.5452,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x192x256_1x4x1_16x16x64_default,283.03,244.69,0.0 +80,1536,2112,7168,torch.float8_e4m3fnuz,54,0,147.1439,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,316.06,221.8,0.0 +80,2048,2112,7168,torch.float8_e4m3fnuz,26,0,184.2233,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x192x128_1x4x1_16x16x64_default,336.59,208.82,0.0 +80,4096,2112,7168,torch.float8_e4m3fnuz,54,0,325.7229,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,380.74,189.73,0.0 +80,8192,2112,7168,torch.float8_e4m3fnuz,54,0,632.8361,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,391.94,171.39,0.0 +80,16384,2112,7168,torch.float8_e4m3fnuz,54,0,1251.1568,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,396.49,161.28,0.0 +80,32768,4096,512,torch.float8_e4m3fnuz,41,0,592.8636,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,231.82,484.61,0.0 +80,32768,2112,7168,torch.float8_e4m3fnuz,54,0,2480.8163,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,399.92,156.57,0.0 +80,32768,4608,7168,torch.float8_e4m3fnuz,54,0,5335.9575,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x192x128_1x4x1_16x16x64_default,405.67,106.8,0.0 +80,32768,7168,2304,torch.float8_e4m3fnuz,41,0,2910.7761,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,371.84,193.0,0.0 +80,32768,512,7168,torch.float8_e4m3fnuz,48,0,634.4121,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_64x256x128_1x4x1_16x16x64_default,379.12,428.91,0.0 +80,32768,7168,256,torch.float8_e4m3fnuz,41,0,747.496,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x64_1x4x1_16x16x64_default,160.88,642.12,0.0 diff --git 
a/aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv b/aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv old mode 100755 new mode 100644 index e97c52db23..9054e3064c --- a/aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv +++ b/aiter/configs/a8w8_bpreshuffle_cktile_untuned_gemm.csv @@ -1,244 +1,221 @@ -M,N,K -1,9216,4096 -2,9216,4096 -4,9216,4096 -8,9216,4096 -16,9216,4096 -32,9216,4096 -64,9216,4096 -128,9216,4096 -256,9216,4096 -1024,9216,4096 -2048,9216,4096 -4096,9216,4096 -4240,9216,4096 -16384,9216,4096 -32768,9216,4096 -1,4608,4096 -2,4608,4096 -4,4608,4096 -8,4608,4096 -16,4608,4096 -32,4608,4096 -64,4608,4096 -128,4608,4096 -256,4608,4096 -1024,4608,4096 -2048,4608,4096 -4096,4608,4096 -16384,4608,4096 -32768,4608,4096 -1,1280,8192 -32,1280,8192 -64,1280,8192 -128,1280,8192 -192,1280,8192 -256,1280,8192 -320,1280,8192 -512,1280,8192 -1024,1280,8192 -2048,1280,8192 -4096,1280,8192 -8192,1280,8192 -16384,1280,8192 -1,8192,1024 -32,8192,1024 -64,8192,1024 -128,8192,1024 -192,8192,1024 -256,8192,1024 -320,8192,1024 -512,8192,1024 -1024,8192,1024 -2048,8192,1024 -4096,8192,1024 -8192,8192,1024 -16384,8192,1024 -16,1536,7168 -32,1536,7168 -64,1536,7168 -128,1536,7168 -256,1536,7168 -512,1536,7168 -1024,1536,7168 -1536,1536,7168 -2048,1536,7168 -4096,1536,7168 -8192,1536,7168 -16384,1536,7168 -20480,1536,7168 -16,3072,1536 -32,3072,1536 -64,3072,1536 -128,3072,1536 -256,3072,1536 -512,3072,1536 -1024,3072,1536 -1536,3072,1536 -2048,3072,1536 -4096,3072,1536 -8192,3072,1536 -16384,3072,1536 -20480,3072,1536 -16,576,7168 -32,576,7168 -64,576,7168 -128,576,7168 -256,576,7168 -512,576,7168 -1024,576,7168 -1536,576,7168 -2048,576,7168 -4096,576,7168 -8192,576,7168 -16384,576,7168 -20480,576,7168 -16,7168,2048 -32,7168,2048 -64,7168,2048 -128,7168,2048 -256,7168,2048 -512,7168,2048 -1024,7168,2048 -1536,7168,2048 -2048,7168,2048 -4096,7168,2048 -8192,7168,2048 -16384,7168,2048 -20480,7168,2048 -16,4608,7168 -32,4608,7168 -64,4608,7168 -128,4608,7168 
-256,4608,7168 -512,4608,7168 -1024,4608,7168 -1536,4608,7168 -2048,4608,7168 -4096,4608,7168 -8192,4608,7168 -16384,4608,7168 -20480,4608,7168 -16,7168,2304 -32,7168,2304 -64,7168,2304 -128,7168,2304 -256,7168,2304 -512,7168,2304 -1024,7168,2304 -1536,7168,2304 -2048,7168,2304 -4096,7168,2304 -8192,7168,2304 -16384,7168,2304 -20480,7168,2304 -16,512,7168 -32,512,7168 -64,512,7168 -128,512,7168 -256,512,7168 -512,512,7168 -1024,512,7168 -1536,512,7168 -2048,512,7168 -4096,512,7168 -8192,512,7168 -16384,512,7168 -20480,512,7168 -16,4096,512 -32,4096,512 -64,4096,512 -128,4096,512 -256,4096,512 -512,4096,512 -1024,4096,512 -1536,4096,512 -2048,4096,512 -4096,4096,512 -8192,4096,512 -16384,4096,512 -20480,4096,512 -16,7168,256 -32,7168,256 -64,7168,256 -128,7168,256 -256,7168,256 -512,7168,256 -1024,7168,256 -1536,7168,256 -2048,7168,256 -4096,7168,256 -8192,7168,256 -16384,7168,256 -20480,7168,256 -32, 7168, 1536 -32, 7168, 576 -32, 2048, 7168 -32, 7168, 512 -32, 256, 7168 -64, 7168, 1536 -64, 7168, 576 -64, 2048, 7168 -64, 7168, 512 -64, 256, 7168 -96, 7168, 1536 -96, 7168, 576 -96, 2048, 7168 -96, 7168, 512 -96, 256, 7168 -128, 7168, 1536 -128, 7168, 576 -128, 2048, 7168 -128, 7168, 512 -128, 256, 7168 -256, 7168, 1536 -256, 7168, 576 -256, 2048, 7168 -256, 7168, 512 -256, 256, 7168 -512, 7168, 1536 -512, 7168, 576 -512, 2048, 7168 -512, 7168, 512 -512, 256, 7168 -1024, 7168, 1536 -1024, 7168, 576 -1024, 2048, 7168 -1024, 7168, 512 -1024, 256, 7168 -2048, 7168, 1536 -2048, 7168, 576 -2048, 2048, 7168 -2048, 7168, 512 -2048, 256, 7168 -4096, 7168, 1536 -4096, 7168, 576 -4096, 2048, 7168 -4096, 7168, 512 -4096, 256, 7168 -8192, 7168, 1536 -8192, 7168, 576 -8192, 2048, 7168 -8192, 7168, 512 -8192, 256, 7168 -16384, 7168, 1536 -16384, 7168, 576 -16384, 2048, 7168 -16384, 7168, 512 -16384, 256, 7168 -5112,6912,5120 -5104,5120,8192 -2048,4096,5120 -5120,5120,4096 -5120,5120,8192 -32, 2112, 7168 -64, 2112, 7168 -96, 2112, 7168 -128, 2112, 7168 -256, 2112, 7168 -512, 2112, 
7168 -1024, 2112, 7168 -2048, 2112, 7168 -4096, 2112, 7168 -8192, 2112, 7168 -16384, 2112, 7168 +M,N,K,q_dtype_w +1,9216,4096,torch.float8_e4m3fnuz +2,9216,4096,torch.float8_e4m3fnuz +4,9216,4096,torch.float8_e4m3fnuz +8,9216,4096,torch.float8_e4m3fnuz +16,9216,4096,torch.float8_e4m3fnuz +32,9216,4096,torch.float8_e4m3fnuz +64,9216,4096,torch.float8_e4m3fnuz +128,9216,4096,torch.float8_e4m3fnuz +256,9216,4096,torch.float8_e4m3fnuz +1024,9216,4096,torch.float8_e4m3fnuz +2048,9216,4096,torch.float8_e4m3fnuz +4096,9216,4096,torch.float8_e4m3fnuz +4240,9216,4096,torch.float8_e4m3fnuz +16384,9216,4096,torch.float8_e4m3fnuz +32768,9216,4096,torch.float8_e4m3fnuz +1,4608,4096,torch.float8_e4m3fnuz +2,4608,4096,torch.float8_e4m3fnuz +4,4608,4096,torch.float8_e4m3fnuz +8,4608,4096,torch.float8_e4m3fnuz +16,4608,4096,torch.float8_e4m3fnuz +32,4608,4096,torch.float8_e4m3fnuz +64,4608,4096,torch.float8_e4m3fnuz +128,4608,4096,torch.float8_e4m3fnuz +256,4608,4096,torch.float8_e4m3fnuz +1024,4608,4096,torch.float8_e4m3fnuz +2048,4608,4096,torch.float8_e4m3fnuz +4096,4608,4096,torch.float8_e4m3fnuz +16384,4608,4096,torch.float8_e4m3fnuz +32768,4608,4096,torch.float8_e4m3fnuz +1,1280,8192,torch.float8_e4m3fnuz +32,1280,8192,torch.float8_e4m3fnuz +64,1280,8192,torch.float8_e4m3fnuz +128,1280,8192,torch.float8_e4m3fnuz +192,1280,8192,torch.float8_e4m3fnuz +256,1280,8192,torch.float8_e4m3fnuz +320,1280,8192,torch.float8_e4m3fnuz +512,1280,8192,torch.float8_e4m3fnuz +1024,1280,8192,torch.float8_e4m3fnuz +2048,1280,8192,torch.float8_e4m3fnuz +4096,1280,8192,torch.float8_e4m3fnuz +8192,1280,8192,torch.float8_e4m3fnuz +16384,1280,8192,torch.float8_e4m3fnuz +1,8192,1024,torch.float8_e4m3fnuz +32,8192,1024,torch.float8_e4m3fnuz +64,8192,1024,torch.float8_e4m3fnuz +128,8192,1024,torch.float8_e4m3fnuz +192,8192,1024,torch.float8_e4m3fnuz +256,8192,1024,torch.float8_e4m3fnuz +320,8192,1024,torch.float8_e4m3fnuz +512,8192,1024,torch.float8_e4m3fnuz +1024,8192,1024,torch.float8_e4m3fnuz 
+2048,8192,1024,torch.float8_e4m3fnuz +4096,8192,1024,torch.float8_e4m3fnuz +8192,8192,1024,torch.float8_e4m3fnuz +16384,8192,1024,torch.float8_e4m3fnuz +16,1536,7168,torch.float8_e4m3fnuz +32,1536,7168,torch.float8_e4m3fnuz +64,1536,7168,torch.float8_e4m3fnuz +128,1536,7168,torch.float8_e4m3fnuz +256,1536,7168,torch.float8_e4m3fnuz +512,1536,7168,torch.float8_e4m3fnuz +1024,1536,7168,torch.float8_e4m3fnuz +1536,1536,7168,torch.float8_e4m3fnuz +2048,1536,7168,torch.float8_e4m3fnuz +4096,1536,7168,torch.float8_e4m3fnuz +8192,1536,7168,torch.float8_e4m3fnuz +16384,1536,7168,torch.float8_e4m3fnuz +20480,1536,7168,torch.float8_e4m3fnuz +16,3072,1536,torch.float8_e4m3fnuz +32,3072,1536,torch.float8_e4m3fnuz +64,3072,1536,torch.float8_e4m3fnuz +128,3072,1536,torch.float8_e4m3fnuz +256,3072,1536,torch.float8_e4m3fnuz +512,3072,1536,torch.float8_e4m3fnuz +1024,3072,1536,torch.float8_e4m3fnuz +1536,3072,1536,torch.float8_e4m3fnuz +2048,3072,1536,torch.float8_e4m3fnuz +4096,3072,1536,torch.float8_e4m3fnuz +8192,3072,1536,torch.float8_e4m3fnuz +16384,3072,1536,torch.float8_e4m3fnuz +20480,3072,1536,torch.float8_e4m3fnuz +16,576,7168,torch.float8_e4m3fnuz +32,576,7168,torch.float8_e4m3fnuz +64,576,7168,torch.float8_e4m3fnuz +128,576,7168,torch.float8_e4m3fnuz +256,576,7168,torch.float8_e4m3fnuz +512,576,7168,torch.float8_e4m3fnuz +1024,576,7168,torch.float8_e4m3fnuz +1536,576,7168,torch.float8_e4m3fnuz +2048,576,7168,torch.float8_e4m3fnuz +4096,576,7168,torch.float8_e4m3fnuz +8192,576,7168,torch.float8_e4m3fnuz +16384,576,7168,torch.float8_e4m3fnuz +20480,576,7168,torch.float8_e4m3fnuz +16,7168,2048,torch.float8_e4m3fnuz +32,7168,2048,torch.float8_e4m3fnuz +64,7168,2048,torch.float8_e4m3fnuz +128,7168,2048,torch.float8_e4m3fnuz +256,7168,2048,torch.float8_e4m3fnuz +512,7168,2048,torch.float8_e4m3fnuz +1024,7168,2048,torch.float8_e4m3fnuz +1536,7168,2048,torch.float8_e4m3fnuz +2048,7168,2048,torch.float8_e4m3fnuz +4096,7168,2048,torch.float8_e4m3fnuz 
+8192,7168,2048,torch.float8_e4m3fnuz +16384,7168,2048,torch.float8_e4m3fnuz +20480,7168,2048,torch.float8_e4m3fnuz +16,4608,7168,torch.float8_e4m3fnuz +32,4608,7168,torch.float8_e4m3fnuz +64,4608,7168,torch.float8_e4m3fnuz +128,4608,7168,torch.float8_e4m3fnuz +256,4608,7168,torch.float8_e4m3fnuz +512,4608,7168,torch.float8_e4m3fnuz +1024,4608,7168,torch.float8_e4m3fnuz +1536,4608,7168,torch.float8_e4m3fnuz +2048,4608,7168,torch.float8_e4m3fnuz +4096,4608,7168,torch.float8_e4m3fnuz +8192,4608,7168,torch.float8_e4m3fnuz +16384,4608,7168,torch.float8_e4m3fnuz +20480,4608,7168,torch.float8_e4m3fnuz +16,7168,2304,torch.float8_e4m3fnuz +32,7168,2304,torch.float8_e4m3fnuz +64,7168,2304,torch.float8_e4m3fnuz +128,7168,2304,torch.float8_e4m3fnuz +256,7168,2304,torch.float8_e4m3fnuz +512,7168,2304,torch.float8_e4m3fnuz +1024,7168,2304,torch.float8_e4m3fnuz +1536,7168,2304,torch.float8_e4m3fnuz +2048,7168,2304,torch.float8_e4m3fnuz +4096,7168,2304,torch.float8_e4m3fnuz +8192,7168,2304,torch.float8_e4m3fnuz +16384,7168,2304,torch.float8_e4m3fnuz +20480,7168,2304,torch.float8_e4m3fnuz +16,512,7168,torch.float8_e4m3fnuz +32,512,7168,torch.float8_e4m3fnuz +64,512,7168,torch.float8_e4m3fnuz +128,512,7168,torch.float8_e4m3fnuz +256,512,7168,torch.float8_e4m3fnuz +512,512,7168,torch.float8_e4m3fnuz +1024,512,7168,torch.float8_e4m3fnuz +1536,512,7168,torch.float8_e4m3fnuz +2048,512,7168,torch.float8_e4m3fnuz +4096,512,7168,torch.float8_e4m3fnuz +8192,512,7168,torch.float8_e4m3fnuz +16384,512,7168,torch.float8_e4m3fnuz +20480,512,7168,torch.float8_e4m3fnuz +16,4096,512,torch.float8_e4m3fnuz +32,4096,512,torch.float8_e4m3fnuz +64,4096,512,torch.float8_e4m3fnuz +128,4096,512,torch.float8_e4m3fnuz +256,4096,512,torch.float8_e4m3fnuz +512,4096,512,torch.float8_e4m3fnuz +1024,4096,512,torch.float8_e4m3fnuz +1536,4096,512,torch.float8_e4m3fnuz +2048,4096,512,torch.float8_e4m3fnuz +4096,4096,512,torch.float8_e4m3fnuz +8192,4096,512,torch.float8_e4m3fnuz +16384,4096,512,torch.float8_e4m3fnuz 
+20480,4096,512,torch.float8_e4m3fnuz +16,7168,256,torch.float8_e4m3fnuz +32,7168,256,torch.float8_e4m3fnuz +64,7168,256,torch.float8_e4m3fnuz +128,7168,256,torch.float8_e4m3fnuz +256,7168,256,torch.float8_e4m3fnuz +512,7168,256,torch.float8_e4m3fnuz +1024,7168,256,torch.float8_e4m3fnuz +1536,7168,256,torch.float8_e4m3fnuz +2048,7168,256,torch.float8_e4m3fnuz +4096,7168,256,torch.float8_e4m3fnuz +8192,7168,256,torch.float8_e4m3fnuz +16384,7168,256,torch.float8_e4m3fnuz +20480,7168,256,torch.float8_e4m3fnuz +1,4096,512,torch.float8_e4m3fnuz +1,2112,7168,torch.float8_e4m3fnuz +1,4608,7168,torch.float8_e4m3fnuz +1,7168,2304,torch.float8_e4m3fnuz +1,512,7168,torch.float8_e4m3fnuz +1,7168,256,torch.float8_e4m3fnuz +16,2112,7168,torch.float8_e4m3fnuz +32,2112,7168,torch.float8_e4m3fnuz +48,4096,512,torch.float8_e4m3fnuz +48,2112,7168,torch.float8_e4m3fnuz +48,4608,7168,torch.float8_e4m3fnuz +48,7168,2304,torch.float8_e4m3fnuz +48,512,7168,torch.float8_e4m3fnuz +48,7168,256,torch.float8_e4m3fnuz +64,2112,7168,torch.float8_e4m3fnuz +80,4096,512,torch.float8_e4m3fnuz +80,2112,7168,torch.float8_e4m3fnuz +80,4608,7168,torch.float8_e4m3fnuz +80,7168,2304,torch.float8_e4m3fnuz +80,512,7168,torch.float8_e4m3fnuz +80,7168,256,torch.float8_e4m3fnuz +96,4096,512,torch.float8_e4m3fnuz +96,2112,7168,torch.float8_e4m3fnuz +96,4608,7168,torch.float8_e4m3fnuz +96,7168,2304,torch.float8_e4m3fnuz +96,512,7168,torch.float8_e4m3fnuz +96,7168,256,torch.float8_e4m3fnuz +112,4096,512,torch.float8_e4m3fnuz +112,2112,7168,torch.float8_e4m3fnuz +112,4608,7168,torch.float8_e4m3fnuz +112,7168,2304,torch.float8_e4m3fnuz +112,512,7168,torch.float8_e4m3fnuz +112,7168,256,torch.float8_e4m3fnuz +128,2112,7168,torch.float8_e4m3fnuz +256,2112,7168,torch.float8_e4m3fnuz +512,2112,7168,torch.float8_e4m3fnuz +1024,2112,7168,torch.float8_e4m3fnuz +1536,2112,7168,torch.float8_e4m3fnuz +2048,2112,7168,torch.float8_e4m3fnuz +4096,2112,7168,torch.float8_e4m3fnuz +8192,2112,7168,torch.float8_e4m3fnuz 
+16384,2112,7168,torch.float8_e4m3fnuz +32768,4096,512,torch.float8_e4m3fnuz +32768,2112,7168,torch.float8_e4m3fnuz +32768,4608,7168,torch.float8_e4m3fnuz +32768,7168,2304,torch.float8_e4m3fnuz +32768,512,7168,torch.float8_e4m3fnuz +32768,7168,256,torch.float8_e4m3fnuz diff --git a/aiter/ops/gemm_op_a8w8.py b/aiter/ops/gemm_op_a8w8.py index de8ab92369..a5abfece9d 100644 --- a/aiter/ops/gemm_op_a8w8.py +++ b/aiter/ops/gemm_op_a8w8.py @@ -431,9 +431,6 @@ def gemm_a8w8_bpreshuffle( n = WQ.shape[0] k = XQ.shape[-1] - get_bpreshuffle_GEMM_config( - m, n, k, dtypes.fp8, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE - ) # if ( # ck_config is None # and dtype == dtypes.bf16 @@ -446,7 +443,40 @@ def gemm_a8w8_bpreshuffle( assert WQ.dtype == dtypes.fp8, "gemm_a8w8_bpreshuffle only support fp8 now" assert bias is None, "gemm_a8w8_bpreshuffle does not support bias now" Y = torch.empty(m, n, dtype=dtype, device=XQ.device) - return gemm_a8w8_bpreshuffle_ck(XQ, WQ, x_scale, w_scale, Y) + + # CKTile only supports bf16 dtype + if dtype == dtypes.bf16: + cktile_config = get_bpreshuffle_GEMM_config( + m, n, k, dtypes.fp8, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE + ) + else: + cktile_config = None + + ck_config = get_bpreshuffle_GEMM_config( + m, n, k, dtypes.fp8, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE + ) + if cktile_config is not None and ck_config is not None: + cktile_time = cktile_config.get("time", float("inf")) + ck_time = ck_config.get("time", float("inf")) + + if AITER_LOG_TUNED_CONFIG: + logger.info( + f"Both CKTile and CK configs found for M:{m}, N:{n}, K:{k} - " + f"CKTile time: {cktile_time:.6f}ms, CK time: {ck_time:.6f}ms" + ) + + if cktile_time <= ck_time: + if AITER_LOG_TUNED_CONFIG: + logger.info(f"Using CKTile implementation (faster)") + return gemm_a8w8_bpreshuffle_cktile(XQ, WQ, x_scale, w_scale, Y) + else: + if AITER_LOG_TUNED_CONFIG: + logger.info(f"Using CK implementation (faster)") + return gemm_a8w8_bpreshuffle_ck(XQ, WQ, x_scale, w_scale, Y) + else: + if 
AITER_LOG_TUNED_CONFIG: + logger.info(f"default Using CK implementation") + return gemm_a8w8_bpreshuffle_ck(XQ, WQ, x_scale, w_scale, Y) def gemm_a8w8_blockscale_fake( @@ -639,33 +669,6 @@ def gemm_a8w8_blockscale_bpreshuffle_tune( ) -> torch.Tensor: ... -def gemm_a8w8_cktile_bpreshuffle( - XQ: Tensor, - WQ: Tensor, - x_scale: Tensor, - w_scale: Tensor, - bias: Optional[Tensor] = None, - dtype=torch.float16, - check=False, -): - assert dtype in [ - torch.bfloat16, - torch.float16, - ], f"Output {dtype=} is currently not supported in gemm_a8w8_cktile_bpreshuffle" - m = XQ.shape[0] - n = WQ.shape[0] - k = XQ.shape[-1] - - # get_bpreshuffle_GEMM_config( - # m, n, k, dtypes.fp8, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE - # ) - get_CKGEMM_config(m, n, k, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_CKTILE_FILE) - assert WQ.dtype == dtypes.fp8, "gemm_a8w8_cktile_bpreshuffle only support fp8 now" - assert bias is None, "gemm_a8w8_cktile_bpreshuffle does not support bias now" - Y = torch.empty(m, n, dtype=dtype, device=XQ.device) - return gemm_a8w8_bpreshuffle_cktile(XQ, WQ, x_scale, w_scale, Y) - - @compile_ops( "module_gemm_a8w8_bpreshuffle_cktile_tune", fc_name="gemm_a8w8_bpreshuffle_cktile_tune", diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/README.md b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md index bba94625eb..afe6d25d13 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/README.md +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md @@ -15,4 +15,4 @@ You can find the results of the tuning in `aiter/configs/a8w8_bpreshuffle_cktile ## More If you want to re-install gemm_a8w8_bpreshuffle_cktile, you should remove `aiter/jit/module_gemm_a8w8_bpreshuffle_cktile.so` and `aiter/jit/build/module_gemm_a8w8_bpreshuffle_cktile` first. -If you use flag `PREBUILD_KERNELS=1 USE_CK_A8W8=1` when you install aiter, it will build gemm a8w8 kernels in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` by default. 
If you want to use the new result of gemm_a8w8_bpreshuffle_cktile_tune, please remove `build` and `*.so` first, then re-intall aiter after finishing tune. +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in `aiter/configs/a8w8_bpreshuffle_cktile_tuned_gemm.csv` by default. If you want to use the new result of gemm_a8w8_bpreshuffle_cktile_tune, please remove `build` and `*.so` first, then re-intall aiter after finishing tune. diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu index ac6291d173..b7658841ff 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.cu @@ -76,9 +76,9 @@ torch::Tensor gemm_a8w8_bpreshuffle_cktile_tune(torch::Tensor& XQ, int K = XQ.size(1); int KBatch = std::pow(2, splitK); - if(Y.dtype() == at::ScalarType::Half) + if(Y.dtype() == at::ScalarType::BFloat16) { - rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y); + rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y); } else { diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py index 5069a7c7bf..177c72f6f5 100755 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile_tune.py @@ -14,7 +14,7 @@ from aiter.utility.mp_tuner import mp_tuner -def run_torch(x, weight, x_scale, w_scale, bias=None, dtype=torch.float16): +def run_torch(x, weight, x_scale, w_scale, bias=None, dtype=torch.bfloat16): x = x.to(dtypes.fp32) * x_scale weight = weight.to(dtypes.fp32) * w_scale out = F.linear(x, weight) @@ -32,12 +32,14 @@ def run_gemm_a8w8_bpreshuffle_cktile( return out -def generate_data(m, n, k, seed, dtype=dtypes.fp16, device="cuda"): +def generate_data( + m, 
n, k, seed, dtype=dtypes.bf16, q_dtype_w=dtypes.fp8, device="cuda" +): torch.manual_seed(seed) x = torch.randn((m, k), dtype=dtype, device=device) weight = torch.randn((n, k), dtype=dtype, device=device) - x, x_scale = aiter.pertoken_quant(x, quant_dtype=dtypes.fp8) - weight, w_scale = aiter.pertoken_quant(weight, quant_dtype=dtypes.fp8) + x, x_scale = aiter.pertoken_quant(x, quant_dtype=q_dtype_w) + weight, w_scale = aiter.pertoken_quant(weight, quant_dtype=q_dtype_w) bias_f32 = None weight_shuffle = shuffle_weight(weight, layout=(16, 16)) out = torch.empty(m, n, dtype=dtype, device=device) @@ -69,7 +71,12 @@ def get_cktile_gemm_a8w8_bpreshuffle_tune_task( useSplitK, seed, ): - (cu_num, M, N, K) = info_keys + (cu_num, M, N, K, q_dtype_w) = info_keys + if eval(q_dtype_w) != dtypes.fp8: + print( + f"Warning: q_dtype_w only support {dtypes.fp8}, actual q_dtype_w is {q_dtype_w}!" + ) + return [] kernels_num = len(kernels_list) gemm_a8w8_idx = [0, 1, 2, 3, 4] # input index in generate_data ref_data_idx = [0, 5, 2, 3, 6] @@ -94,7 +101,7 @@ def get_cktile_gemm_a8w8_bpreshuffle_tune_task( ( info, generate_data, - (M, N, K, seed, dtypes.fp16), + (M, N, K, seed, dtypes.bf16, eval(q_dtype_w)), run_gemm_a8w8_bpreshuffle_cktile, ( gemm_a8w8_idx, @@ -105,7 +112,7 @@ def get_cktile_gemm_a8w8_bpreshuffle_tune_task( run_torch, ( ref_data_idx, - dtypes.fp16, + dtypes.bf16, ), {}, None, @@ -134,10 +141,11 @@ def tune( M = untunedf.loc[i, "M"] N = untunedf.loc[i, "N"] K = untunedf.loc[i, "K"] + q_dtype_w = untunedf.loc[i, "q_dtype_w"] seed = seed + 1 total_kernel_nums = 0 kernels_num = len(kernels_list) - info_keys = (cu_num, M, N, K) + info_keys = (cu_num, M, N, K, q_dtype_w) task.extend( self.get_cktile_gemm_a8w8_bpreshuffle_tune_task( info_keys, @@ -158,7 +166,7 @@ def tune( if __name__ == "__main__": ## use default key and resultList - key = ["cu_num", "M", "N", "K"] + key = ["cu_num", "M", "N", "K", "q_dtype_w"] tuner = GemmA8W8BpreShuffleCktileTuner( 
"GemmA8W8BpreShuffleCktileTuner", key=key, diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py index cbdbe0af15..656a273d80 100755 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py @@ -126,8 +126,8 @@ def gen_instance(self, k: kernelInstance): if self.istune: Path( - os.path.join(self.instances_path, f"{k.name}_dFP32_eFP16.cpp") - ).write_text(INSTANCE_dFP32_eFP16) + os.path.join(self.instances_path, f"{k.name}_dFP32_eBF16.cpp") + ).write_text(INSTANCE_dFP32_eBF16) else: Path( os.path.join(self.instances_path, f"{k.name}_dFP32_eBF16.cpp") diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh index c19fb3e15c..4a039422c2 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh @@ -25,7 +25,7 @@ using F32 = float; using B16 = ck_tile::bf16_t; using ADataType = typename GemmBasicTypeConfig::ADataType; using BDataType = typename GemmBasicTypeConfig::BDataType; -using CDataType = typename GemmBasicTypeConfig::CDataType; +using CDataType = ck_tile::bf16_t; using AccDataType = typename GemmBasicTypeConfig::AccDataType; using ALayout = ck_tile::tensor_layout::gemm::RowMajor; using BLayout = ck_tile::tensor_layout::gemm::ColumnMajor; @@ -330,7 +330,7 @@ gemm_a8w8_bpreshuffle_cktile_impl(torch::Tensor& XQ, TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), "Scales should have the same dtype!"); using ADataType = typename GemmBasicTypeConfig::ADataType; using BDataType = typename GemmBasicTypeConfig::BDataType; - using CDataType = typename GemmBasicTypeConfig::CDataType; + using CDataType = ck_tile::bf16_t;; using AccDataType = typename GemmBasicTypeConfig::AccDataType; using DsDataType = ck_tile::tuple<>; 
using ALayout = ck_tile::tensor_layout::gemm::RowMajor; diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py index 8f0691e8e0..b7ee6c208a 100755 --- a/op_tests/test_gemm_a8w8.py +++ b/op_tests/test_gemm_a8w8.py @@ -57,11 +57,6 @@ def run_gemm_ck_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): return aiter.gemm_a8w8_bpreshuffle(x, weight, x_scale, w_scale, None, dtype) -@perftest() -def run_gemm_cktile_bpreshuffle(x, weight, x_scale, w_scale, dtype=dtypes.bf16): - return aiter.gemm_a8w8_cktile_bpreshuffle(x, weight, x_scale, w_scale, None, dtype) - - @perftest() def run_gemm_asm(x, weightshuffle, x_scale, w_scale, bias=None, dtype=dtypes.bf16): return aiter.gemm_a8w8_ASM(x, weightshuffle, x_scale, w_scale, bias) @@ -135,15 +130,6 @@ def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8): else: avg_e = None err_e = None - if quantDtype == dtypes.fp8 and dtype == dtypes.fp16: - f, avg_f = run_gemm_cktile_bpreshuffle( - x, weightshuffle, x_scale, w_scale, dtype - ) - f = f + bias - err_f = checkAllclose(a, f, msg="cktile bpreshuffle: ", rtol=1e-2, atol=1e-2) - else: - avg_f = None - err_f = None return { "ck us": avg_b, "ck err": err_b, @@ -153,8 +139,6 @@ def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8): "asm err": err_d, "hipmm bpreshuffle us": avg_e, "hipmm bpreshuffle err": err_e, - "cktile bpreshuffle us": avg_f, - "cktile bpreshuffle err": err_f, } From ccda771113069fa1af40319a87d612a68115781b Mon Sep 17 00:00:00 2001 From: solin Date: Wed, 19 Nov 2025 02:47:56 +0000 Subject: [PATCH 12/13] refine --- aiter/ops/gemm_op_a8w8.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/aiter/ops/gemm_op_a8w8.py b/aiter/ops/gemm_op_a8w8.py index a5abfece9d..01b019c020 100644 --- a/aiter/ops/gemm_op_a8w8.py +++ b/aiter/ops/gemm_op_a8w8.py @@ -274,13 +274,19 @@ def get_bpreshuffle_GEMM_config( q_dtype_w: torch.dtype, tuned_file=f"{AITER_ROOT_DIR}/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv", ): - if not 
hasattr(get_bpreshuffle_GEMM_config, "bpreshuffle_gemm_dict"): + # Use dict to cache configs for different files + if not hasattr(get_bpreshuffle_GEMM_config, "file_cache"): + get_bpreshuffle_GEMM_config.file_cache = {} + + # Load file if not cached + if tuned_file not in get_bpreshuffle_GEMM_config.file_cache: asmGemmDictDf = pd.read_csv(tuned_file).drop_duplicates() - get_bpreshuffle_GEMM_config.bpreshuffle_gemm_dict = asmGemmDictDf.set_index( + get_bpreshuffle_GEMM_config.file_cache[tuned_file] = asmGemmDictDf.set_index( ["cu_num", "M", "N", "K", "q_dtype_w"] ).to_dict("index") + cu_num = get_cu_num() - config = get_bpreshuffle_GEMM_config.bpreshuffle_gemm_dict.get( + config = get_bpreshuffle_GEMM_config.file_cache[tuned_file].get( (cu_num, M, N, K, str(q_dtype_w)), None ) if config is not None: @@ -456,13 +462,13 @@ def gemm_a8w8_bpreshuffle( m, n, k, dtypes.fp8, AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE ) if cktile_config is not None and ck_config is not None: - cktile_time = cktile_config.get("time", float("inf")) - ck_time = ck_config.get("time", float("inf")) + cktile_time = cktile_config.get("us", float("inf")) + ck_time = ck_config.get("us", float("inf")) if AITER_LOG_TUNED_CONFIG: logger.info( f"Both CKTile and CK configs found for M:{m}, N:{n}, K:{k} - " - f"CKTile time: {cktile_time:.6f}ms, CK time: {ck_time:.6f}ms" + f"CKTile time: {cktile_time:.6f}us, CK time: {ck_time:.6f}us" ) if cktile_time <= ck_time: From eb066a9b2e7c9b7536eddbb27d20fd93ffc70c79 Mon Sep 17 00:00:00 2001 From: Ying Zhou Date: Wed, 19 Nov 2025 14:36:54 +0800 Subject: [PATCH 13/13] add get_padded_M --- aiter/ops/gemm_op_a8w8.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/aiter/ops/gemm_op_a8w8.py b/aiter/ops/gemm_op_a8w8.py index 01b019c020..6cd2da4758 100644 --- a/aiter/ops/gemm_op_a8w8.py +++ b/aiter/ops/gemm_op_a8w8.py @@ -286,15 +286,20 @@ def get_bpreshuffle_GEMM_config( ).to_dict("index") cu_num = get_cu_num() - config = 
get_bpreshuffle_GEMM_config.file_cache[tuned_file].get( - (cu_num, M, N, K, str(q_dtype_w)), None - ) - if config is not None: - if AITER_LOG_TUNED_CONFIG: - logger.info( - f"shape M:{M}, N:{N}, K:{K} q_dtype_w:{q_dtype_w} is tuned, in {tuned_file}!" - ) - else: + padded_M = M + config = None + for gl in [None, 0, 1]: + padded_M = M if gl is None else get_padded_m(M, N, K, gl) + config = get_bpreshuffle_GEMM_config.file_cache[tuned_file].get( + (cu_num, padded_M, N, K, str(q_dtype_w)), None + ) + if config is not None: + if AITER_LOG_TUNED_CONFIG: + logger.info( + f"shape M:{M}, N:{N}, K:{K} q_dtype_w:{q_dtype_w}, found padded_M: {padded_M}, N:{N}, K:{K} is tuned, in {tuned_file}!" + ) + break + if config is None: logger.info( f"shape is M:{M}, N:{N}, K:{K}, q_dtype_w:{q_dtype_w}, not found tuned config in {tuned_file}, will use default config!" )