diff --git a/aiter/configs/a4w4_blockscale_tuned_gemm.csv b/aiter/configs/a4w4_blockscale_tuned_gemm.csv index 7e459e9534..aad51a13db 100644 --- a/aiter/configs/a4w4_blockscale_tuned_gemm.csv +++ b/aiter/configs/a4w4_blockscale_tuned_gemm.csv @@ -1,1471 +1,1471 @@ -cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,512,4096,21,0,5.4049,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,0.78,194.57,0.0 -256,1,800,5120,29,0,8.5995,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,0.95,238.64,0.0 -256,1,1024,3072,21,0,5.7795,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.09,272.77,0.0 -256,1,1024,4096,29,0,6.7429,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1.24,311.62,0.0 -256,1,1280,8192,21,0,16.726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.25,313.85,0.0 -256,1,1536,3072,29,0,6.5636,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1.44,360.15,0.0 -256,1,1536,7168,21,0,12.1915,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.81,452.09,0.0 -256,1,2048,6144,29,0,10.9087,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.31,577.39,0.0 -256,1,2048,7168,21,0,12.2956,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.39,597.59,0.0 -256,1,2112,7168,21,0,12.3647,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.45,612.81,0.0 -256,1,2304,16384,21,0,22.5794,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.34,836.48,0.0 -256,1,2560,8192,21,0,17.5745,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.39,597.17,0.0 -256,1,3072,1536,21,0,5.7648,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.64,410.46,0.0 -256,1,3072,6144,21,0,10.8323,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.48,872.06,0.0 -256,1,4096,1024,29,0,4.9501,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1.69,425.42,0.0 -256,1,4096,4096,21,0,8.4223,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.98,997.22,0.0 -256,1,4096,8192,21,0,12.3973,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.41,1354.29,0.0 -256,1,4096,14336,21,0,20.9628,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.6,1401.32,0.0 -256,1,4608,16384,21,0,24.1342,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.26,1564.84,0.0 -256,1,5120,1280,29,0,6.2868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.08,522.95,0.0 -256,1,5120,5120,21,0,10.3166,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.08,1271.74,0.0 -256,1,5120,6400,21,0,13.1399,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.99,1247.91,0.0 -256,1,5120,25600,21,0,35.9675,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.29,1822.73,0.0 -256,1,6144,2048,29,0,6.2966,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.0,1001.3,0.0 -256,1,6144,3072,21,0,7.0757,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.33,1335.7,0.0 -256,1,6144,4096,21,0,8.4464,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.96,1491.43,0.0 -256,1,6144,12288,21,0,18.4152,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.2,2050.87,0.0 -256,1,6144,16384,21,0,24.3831,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.26,2065.04,0.0 -256,1,6400,5120,29,0,12.2475,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,5.35,1339.0,0.0 -256,1,7168,2048,29,0,6.3797,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.6,1152.94,0.0 -256,1,7168,8192,21,0,13.6832,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.58,2147.05,0.0 -256,1,7168,16384,21,0,24.8921,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.44,2359.9,0.0 -256,1,7168,18432,21,0,27.4078,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.64,2411.13,0.0 -256,1,8192,1024,21,0,5.9074,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.84,712.87,0.0 -256,1,8192,2048,21,0,6.6356,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.06,1266.81,0.0 -256,1,8192,3584,21,0,8.4146,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.98,1746.75,0.0 -256,1,8192,4096,37,0,8.8326,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,7.6,1901.55,0.0 -256,1,8192,7168,42,0,17.189,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,6.83,1709.24,0.0 -256,1,8192,8192,21,0,14.1316,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.5,2375.87,0.0 -256,1,8192,28672,21,0,43.2883,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.85,2713.69,0.0 -256,1,9216,16384,21,0,26.0135,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.61,2903.27,0.0 -256,1,10240,8192,21,0,14.6562,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.45,2863.47,0.0 -256,1,12288,512,29,0,4.3707,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.88,725.41,0.0 -256,1,12288,1536,29,0,5.8524,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.45,1616.86,0.0 -256,1,12288,4096,21,0,9.5064,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.59,2650.05,0.0 -256,1,12288,6144,21,0,12.76,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.83,2960.53,0.0 -256,1,12800,5120,21,0,14.2687,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.19,2298.47,0.0 -256,1,13312,16384,21,0,33.2284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,13.13,3282.94,0.0 -256,1,14336,8192,37,0,17.338,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,13.55,3388.69,0.0 -256,1,16384,512,29,0,4.4281,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,3.79,954.66,0.0 -256,1,16384,2048,37,0,6.2861,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,10.68,2674.31,0.0 -256,1,16384,4096,21,0,11.8228,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.35,2841.06,0.0 -256,1,16384,6656,37,0,15.0912,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,14.45,3615.49,0.0 -256,1,16384,8192,29,0,19.807,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,13.55,3390.0,0.0 -256,1,16384,13312,21,0,24.7664,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,17.61,4404.81,0.0 -256,1,16384,16384,29,0,29.8545,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,17.98,4497.1,0.0 -256,1,16384,26624,29,0,47.945,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,18.2,4550.0,0.0 -256,1,16384,53248,21,0,85.9036,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,20.31,5078.56,0.0 -256,1,18432,7168,21,0,14.9154,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,17.72,4431.71,0.0 -256,1,18432,16384,0,0,48.5148,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,12.45,3113.28,0.0 -256,1,20480,16384,21,0,34.0333,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.72,4931.09,0.0 -256,1,24576,1536,21,0,6.1017,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,12.37,3101.48,0.0 -256,1,26624,16384,46,0,44.6996,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,19.52,4880.7,0.0 -256,1,28672,4096,21,0,14.0433,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.73,4185.6,0.0 -256,1,32768,512,29,0,5.0283,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.67,1681.36,0.0 -256,1,51200,5120,14,0,23.3643,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,22.44,5614.42,0.0 -256,1,53248,16384,0,0,73.6484,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,23.69,5924.4,0.0 -256,1,57344,8192,14,0,44.7024,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,21.02,5256.98,0.0 -256,1,59136,8192,14,0,46.5332,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,20.82,5207.97,0.0 -256,1,106496,16384,3,0,145.2761,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,24.02,6006.74,0.0 -256,2,512,4096,21,0,5.4335,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.54,194.11,0.0 -256,2,1024,3072,21,0,5.8351,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.16,270.78,0.0 -256,2,1024,4096,29,0,6.8239,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.46,308.53,0.0 -256,2,1536,3072,29,0,6.3888,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.95,370.73,0.0 -256,2,1536,7168,21,0,12.3332,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.57,447.44,0.0 -256,2,2048,6144,21,0,10.8043,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.66,583.64,0.0 -256,2,2048,7168,21,0,12.4844,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.7,589.17,0.0 -256,2,3072,1536,21,0,5.8068,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.25,408.68,0.0 -256,2,3072,6144,21,0,10.7991,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.99,875.59,0.0 -256,2,4096,1024,29,0,5.0164,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,3.34,421.53,0.0 -256,2,4096,4096,21,0,9.4735,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.08,887.64,0.0 -256,2,4096,8192,21,0,12.6033,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.65,1333.13,0.0 -256,2,4096,14336,21,0,21.0437,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.16,1396.66,0.0 -256,2,6144,2048,21,0,6.2664,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.03,1008.25,0.0 -256,2,6144,3072,21,0,7.4379,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.15,1272.51,0.0 -256,2,6144,4096,21,0,9.2767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.85,1359.49,0.0 -256,2,6144,12288,21,0,18.5205,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.31,2040.2,0.0 -256,2,6144,16384,21,0,24.4987,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.44,2056.13,0.0 -256,2,7168,2048,21,0,6.2888,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.34,1172.04,0.0 -256,2,7168,8192,21,0,13.8337,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.98,2125.03,0.0 -256,2,7168,16384,21,0,25.1389,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,18.69,2337.62,0.0 -256,2,7168,18432,21,0,27.5631,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.17,2398.4,0.0 -256,2,8192,4096,21,0,8.3685,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.04,2009.21,0.0 -256,2,8192,8192,21,0,14.1231,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.01,2378.75,0.0 -256,2,8192,28672,21,0,44.8066,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,20.97,2622.43,0.0 -256,2,10240,8192,21,0,14.7818,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.7,2840.8,0.0 -256,2,12288,512,29,0,4.3287,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,5.81,738.19,0.0 -256,2,12288,1536,29,0,6.1797,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,12.22,1535.33,0.0 -256,2,12288,4096,21,0,9.7868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,20.57,2576.85,0.0 -256,2,12288,6144,29,0,11.8365,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,25.51,3193.85,0.0 -256,2,16384,512,21,0,4.3413,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.73,981.35,0.0 -256,2,16384,2048,21,0,6.2153,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,21.59,2710.22,0.0 -256,2,16384,16384,29,0,32.719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,32.82,4104.64,0.0 -256,2,16384,53248,21,0,86.2066,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,40.48,5061.4,0.0 -256,2,18432,7168,21,0,14.9357,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,35.38,4428.4,0.0 -256,2,20480,16384,21,0,32.5723,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,41.21,5153.78,0.0 -256,2,24576,1536,21,0,6.1676,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,24.48,3076.43,0.0 -256,2,28672,4096,29,0,14.0902,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,33.34,4175.88,0.0 -256,2,32768,512,21,0,5.2683,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,12.74,1617.26,0.0 -256,2,57344,8192,14,0,44.6413,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,42.09,5266.84,0.0 -256,2,59136,8192,14,0,46.4974,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,41.67,5214.61,0.0 -256,2,106496,16384,17,0,148.879,a4w4_blockscale_256x32x512x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,46.88,5862.87,0.0 -256,4,512,4096,21,0,5.4783,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.06,193.65,0.0 -256,4,1024,3072,21,0,5.8909,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.27,269.43,0.0 -256,4,1024,4096,29,0,6.9926,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.8,302.25,0.0 -256,4,1536,3072,29,0,6.8371,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,5.52,347.77,0.0 -256,4,1536,7168,21,0,12.4212,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.09,445.34,0.0 -256,4,2048,6144,21,0,10.9315,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.21,578.16,0.0 -256,4,2048,7168,21,0,12.6032,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.32,584.83,0.0 -256,4,3072,1536,21,0,5.7958,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.51,411.84,0.0 -256,4,3072,6144,21,0,10.8927,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,13.86,869.76,0.0 -256,4,4096,1024,29,0,5.0499,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.64,422.18,0.0 -256,4,4096,4096,21,0,8.8735,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,15.13,949.97,0.0 -256,4,4096,8192,21,0,12.641,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,21.24,1331.09,0.0 -256,4,4096,14336,21,0,21.2148,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.14,1386.84,0.0 -256,4,6144,2048,21,0,6.4043,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,15.72,990.69,0.0 -256,4,6144,3072,21,0,7.677,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.67,1236.48,0.0 -256,4,6144,4096,21,0,8.9851,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.41,1406.8,0.0 -256,4,6144,12288,21,0,18.6823,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,32.33,2024.51,0.0 -256,4,6144,16384,21,0,24.7138,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,32.59,2039.9,0.0 -256,4,7168,2048,37,0,6.45,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,18.21,1147.52,0.0 -256,4,7168,8192,21,0,13.9848,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,33.59,2104.7,0.0 -256,4,7168,16384,21,0,25.2649,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,37.19,2327.75,0.0 -256,4,7168,18432,21,0,27.7628,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,38.07,2382.85,0.0 -256,4,8192,4096,21,0,8.9554,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,29.97,1881.65,0.0 -256,4,8192,8192,21,0,14.12,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,38.02,2382.18,0.0 -256,4,8192,28672,29,0,45.3869,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,41.4,2590.25,0.0 -256,4,10240,8192,21,0,14.8201,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,45.28,2836.78,0.0 -256,4,12288,512,29,0,4.3638,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,11.53,743.63,0.0 -256,4,12288,1536,21,0,5.9719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,25.28,1597.24,0.0 -256,4,12288,4096,37,0,9.9657,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,40.4,2535.93,0.0 -256,4,12288,6144,21,0,11.6196,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,51.98,3258.23,0.0 -256,4,16384,512,29,0,4.4621,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,15.04,969.59,0.0 -256,4,16384,2048,21,0,6.5496,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,40.99,2582.2,0.0 -256,4,16384,4096,22,0,21.86,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,24.56,1541.34,0.0 -256,4,16384,16384,21,0,32.3766,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,66.33,4150.58,0.0 -256,4,16384,53248,21,0,87.0063,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,80.22,5016.25,0.0 -256,4,18432,7168,21,0,14.8505,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,71.17,4459.25,0.0 -256,4,18432,16384,0,0,47.9528,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,50.38,3152.58,0.0 -256,4,20480,16384,21,0,32.6683,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,82.17,5141.64,0.0 -256,4,24576,1536,21,0,6.1764,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,48.89,3088.21,0.0 -256,4,28672,4096,21,0,13.9716,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,67.25,4219.83,0.0 -256,4,32768,512,21,0,5.2987,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,25.33,1632.81,0.0 -256,4,57344,8192,14,0,44.7537,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,83.97,5258.92,0.0 -256,4,59136,8192,14,0,46.6928,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,83.0,5198.03,0.0 -256,4,106496,16384,3,0,147.4268,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,94.68,5923.62,0.0 -256,8,512,4096,21,0,5.5567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.04,193.13,0.0 -256,8,512,7168,29,0,9.6677,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.07,193.62,0.0 -256,8,1024,3072,21,0,6.0269,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.35,265.73,0.0 -256,8,1024,4096,29,0,6.937,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,9.67,307.04,0.0 -256,8,1536,3072,29,0,6.7419,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,11.2,355.41,0.0 -256,8,1536,7168,21,0,12.6568,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,13.92,439.15,0.0 -256,8,2048,6144,29,0,11.2567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,17.89,564.0,0.0 -256,8,2048,7168,21,0,12.8216,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,18.32,577.27,0.0 -256,8,2112,7168,21,0,12.8435,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,18.86,594.22,0.0 -256,8,3072,1536,42,0,5.4682,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,13.81,441.57,0.0 -256,8,3072,6144,21,0,11.0931,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,27.22,857.37,0.0 -256,8,4096,1024,29,0,5.0957,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,13.17,425.22,0.0 -256,8,4096,4096,21,0,9.0927,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,29.52,931.57,0.0 -256,8,4096,8192,21,0,12.8815,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,41.68,1310.06,0.0 -256,8,4096,14336,21,0,21.6872,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,43.32,1359.47,0.0 -256,8,6144,2048,29,0,6.4847,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,31.05,986.62,0.0 -256,8,6144,3072,21,0,7.2755,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,41.51,1312.32,0.0 -256,8,6144,4096,21,0,9.0955,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,44.27,1396.03,0.0 -256,8,6144,12288,21,0,19.0255,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.49,1991.86,0.0 -256,8,6144,16384,21,0,25.2355,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.82,2000.97,0.0 -256,8,7168,2048,29,0,5.836,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,40.25,1278.77,0.0 -256,8,7168,8192,21,0,14.1279,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,66.5,2088.6,0.0 -256,8,7168,16384,21,0,25.6172,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,73.35,2299.26,0.0 -256,8,7168,18432,21,0,28.2761,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.76,2342.92,0.0 -256,8,8192,4096,21,0,8.5155,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.05,1987.51,0.0 -256,8,8192,8192,21,0,14.4423,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.35,2334.69,0.0 -256,8,8192,28672,21,0,45.9253,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,81.83,2562.56,0.0 -256,8,10240,8192,21,0,15.0535,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,89.16,2799.33,0.0 -256,8,12288,512,29,0,4.4584,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,22.58,750.13,0.0 -256,8,12288,1536,21,0,5.8293,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,51.81,1653.7,0.0 -256,8,12288,4096,29,0,9.7782,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,82.36,2595.45,0.0 -256,8,12288,6144,21,0,11.7431,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,102.87,3233.38,0.0 -256,8,16384,512,42,0,4.8202,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,27.84,924.96,0.0 -256,8,16384,2048,21,0,6.3214,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,84.93,2696.8,0.0 -256,8,16384,16384,21,0,29.9374,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,143.46,4494.22,0.0 -256,8,16384,53248,21,0,88.6774,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,157.41,4924.4,0.0 -256,8,18432,7168,21,0,15.1166,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,139.84,4391.46,0.0 -256,8,18432,16384,22,0,59.6071,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,81.06,2539.22,0.0 -256,8,20480,16384,21,0,32.719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,164.09,5139.69,0.0 -256,8,24576,1536,21,0,6.2605,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,96.47,3078.62,0.0 -256,8,26624,16384,24,0,82.4334,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,84.67,2651.78,0.0 -256,8,28672,4096,29,0,13.8939,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,135.24,4260.53,0.0 -256,8,32768,512,21,0,5.0403,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,53.26,1768.73,0.0 -256,8,57344,8192,14,0,44.7964,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,167.79,5264.51,0.0 -256,8,59136,8192,14,0,46.5871,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,166.38,5220.33,0.0 -256,8,106496,16384,3,0,147.25,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,189.59,5936.74,0.0 -256,16,512,4096,21,0,5.6787,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.82,193.31,0.0 -256,16,512,7168,4,0,16.8692,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,6.96,113.15,0.0 -256,16,800,5120,22,0,21.2953,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,6.15,99.3,0.0 -256,16,1024,3072,21,0,6.1362,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.4,265.67,0.0 -256,16,1024,4096,29,0,7.1983,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,18.65,300.44,0.0 -256,16,1280,8192,42,0,16.8681,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,19.89,317.13,0.0 -256,16,1536,3072,21,0,6.7767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.28,359.03,0.0 -256,16,1536,7168,21,0,13.1256,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,26.84,427.52,0.0 -256,16,2048,6144,21,0,11.4627,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,35.13,558.87,0.0 -256,16,2048,7168,21,0,13.274,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,35.39,562.22,0.0 -256,16,2304,16384,21,0,24.026,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,50.28,794.11,0.0 -256,16,2560,8192,37,0,17.9374,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,37.41,592.8,0.0 -256,16,3072,1536,21,0,6.009,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,25.13,411.03,0.0 -256,16,3072,6144,21,0,11.4387,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,52.8,837.91,0.0 -256,16,4096,1024,29,0,5.2016,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,25.8,429.95,0.0 -256,16,4096,4096,21,0,9.4039,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,57.09,909.46,0.0 -256,16,4096,8192,21,0,13.3131,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,80.65,1274.97,0.0 -256,16,4096,14336,21,0,22.4402,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,83.74,1319.32,0.0 -256,16,4608,7168,0,0,22.3101,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,47.38,749.43,0.0 -256,16,4608,16384,21,0,25.737,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,93.87,1477.53,0.0 -256,16,5120,1280,29,0,6.4902,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,32.31,531.71,0.0 -256,16,5120,5120,37,0,14.0232,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,59.82,949.28,0.0 -256,16,5120,6400,46,0,14.2958,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,73.35,1161.11,0.0 -256,16,5120,25600,21,0,39.0242,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,107.48,1688.81,0.0 -256,16,6144,2048,21,0,6.5528,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,61.45,992.62,0.0 -256,16,6144,3072,29,0,7.6676,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,78.77,1259.63,0.0 -256,16,6144,4096,29,0,9.8679,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,81.61,1298.38,0.0 -256,16,6144,12288,21,0,19.6767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,122.78,1933.44,0.0 -256,16,6144,16384,21,0,25.9976,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,123.9,1948.62,0.0 -256,16,6400,5120,29,0,13.8091,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,75.93,1204.26,0.0 -256,16,7168,2048,21,0,6.4858,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,72.43,1169.6,0.0 -256,16,7168,2304,0,0,9.5571,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,55.3,889.95,0.0 -256,16,7168,8192,21,0,14.6268,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,128.47,2027.45,0.0 -256,16,7168,16384,21,0,26.4865,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,141.89,2230.6,0.0 -256,16,7168,18432,21,0,29.2062,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,144.76,2274.76,0.0 -256,16,8192,1024,21,0,5.453,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,49.23,818.75,0.0 -256,16,8192,2048,21,0,7.2534,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.02,1194.91,0.0 -256,16,8192,3584,21,0,8.9059,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,105.49,1681.01,0.0 -256,16,8192,4096,21,0,8.6726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,123.81,1968.51,0.0 -256,16,8192,7168,21,0,16.4747,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,114.06,1801.53,0.0 -256,16,8192,8192,21,0,14.8938,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,144.19,2274.91,0.0 -256,16,8192,28672,21,0,46.3894,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,162.02,2542.22,0.0 -256,16,9216,16384,21,0,27.5351,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,175.48,2757.33,0.0 -256,16,10240,8192,21,0,15.5584,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,172.53,2721.12,0.0 -256,16,12288,512,21,0,4.4122,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,45.63,803.01,0.0 -256,16,12288,1536,21,0,6.3038,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,95.81,1561.39,0.0 -256,16,12288,4096,42,0,10.1125,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,159.27,2530.71,0.0 -256,16,12288,6144,21,0,12.4264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,194.42,3073.38,0.0 -256,16,12800,5120,21,0,15.1771,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,138.18,2188.73,0.0 -256,16,13312,16384,21,0,29.6788,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,235.16,3693.17,0.0 -256,16,14336,8192,21,0,18.0277,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,208.46,3286.31,0.0 -256,16,16384,512,21,0,4.4994,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,59.66,1049.63,0.0 -256,16,16384,2048,21,0,6.429,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,167.02,2693.71,0.0 -256,16,16384,4096,21,0,13.7825,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,155.81,2474.99,0.0 -256,16,16384,6656,29,0,18.0628,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,193.2,3050.66,0.0 -256,16,16384,8192,21,0,20.1001,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,213.68,3368.08,0.0 -256,16,16384,13312,37,0,26.8849,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,259.6,4079.71,0.0 -256,16,16384,16384,21,0,30.6345,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,280.4,4402.65,0.0 -256,16,16384,26624,37,0,49.6732,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,281.01,4405.62,0.0 -256,16,16384,53248,21,0,91.0136,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,306.74,4803.21,0.0 -256,16,18432,7168,21,0,15.5712,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,271.52,4284.03,0.0 -256,16,18432,16384,22,0,60.0972,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,160.8,2524.51,0.0 -256,16,20480,16384,21,0,33.3906,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,321.57,5048.09,0.0 -256,16,24576,1536,21,0,6.4818,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,186.36,3035.13,0.0 -256,16,26624,16384,42,0,45.9082,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,304.06,4772.28,0.0 -256,16,28672,4096,21,0,14.2242,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,264.2,4195.0,0.0 -256,16,32768,512,21,0,5.0649,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,106.0,1864.06,0.0 -256,16,51200,5120,14,0,24.0294,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,349.1,5524.54,0.0 -256,16,53248,16384,12,0,74.0858,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,376.82,5912.64,0.0 -256,16,57344,8192,14,0,44.8301,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,335.32,5281.75,0.0 -256,16,59136,8192,14,0,47.0904,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,329.2,5185.32,0.0 -256,16,106496,16384,17,0,145.7428,a4w4_blockscale_256x32x512x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,383.1,6010.27,0.0 -256,32,512,4096,21,0,5.9227,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.66,193.64,0.0 -256,32,512,7168,4,0,17.0396,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,13.78,116.34,0.0 -256,32,800,5120,22,0,21.989,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,11.92,99.19,0.0 -256,32,1024,3072,29,0,6.3604,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,31.65,265.32,0.0 -256,32,1024,4096,29,0,7.9175,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,33.9,281.43,0.0 -256,32,1280,8192,21,0,16.8907,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,39.73,323.01,0.0 -256,32,1536,3072,29,0,7.4775,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,40.39,335.24,0.0 -256,32,1536,7168,21,0,13.9479,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,50.52,409.96,0.0 -256,32,2048,6144,29,0,12.2392,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,65.8,532.78,0.0 -256,32,2048,7168,21,0,14.1868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,66.23,534.71,0.0 -256,32,2304,16384,21,0,25.8248,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,93.55,746.72,0.0 -256,32,2560,8192,29,0,17.301,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,77.58,623.12,0.0 -256,32,3072,1536,29,0,6.1627,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,49.0,418.73,0.0 -256,32,3072,6144,21,0,12.2126,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,98.91,796.89,0.0 -256,32,4096,1024,14,0,5.3091,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,50.56,447.47,0.0 -256,32,4096,4096,21,0,10.0025,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,107.35,871.41,0.0 -256,32,4096,8192,21,0,14.4168,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,148.96,1191.0,0.0 -256,32,4096,14336,21,0,24.1615,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,155.54,1235.5,0.0 -256,32,4608,7168,0,0,22.2165,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,95.15,761.81,0.0 -256,32,4608,16384,21,0,27.3808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,176.47,1399.0,0.0 -256,32,5120,1280,29,0,6.1505,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,68.19,589.38,0.0 -256,32,5120,5120,29,0,14.5083,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,115.64,931.66,0.0 -256,32,5120,6400,29,0,16.8268,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,124.63,999.24,0.0 -256,32,5120,25600,21,0,42.0705,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,199.39,1575.29,0.0 -256,32,6144,2048,29,0,6.6444,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,121.2,1010.99,0.0 -256,32,6144,3072,29,0,9.4195,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,128.24,1048.84,0.0 -256,32,6144,4096,46,0,10.1863,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,158.12,1280.31,0.0 -256,32,6144,12288,21,0,21.2506,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,227.37,1804.12,0.0 -256,32,6144,16384,21,0,27.663,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,232.89,1843.15,0.0 -256,32,6400,5120,21,0,15.6923,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,133.64,1075.4,0.0 -256,32,7168,2048,42,0,6.821,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,137.74,1148.15,0.0 -256,32,7168,2304,0,0,9.7787,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,108.09,895.12,0.0 -256,32,7168,8192,21,0,15.6065,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,240.8,1919.07,0.0 -256,32,7168,16384,21,0,28.1831,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,266.69,2109.11,0.0 -256,32,7168,18432,21,0,31.3603,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,269.63,2130.53,0.0 -256,32,8192,1024,21,0,5.8783,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,91.33,805.5,0.0 -256,32,8192,2048,21,0,7.2694,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,147.71,1230.59,0.0 -256,32,8192,3584,21,0,9.4337,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,199.18,1617.78,0.0 -256,32,8192,4096,29,0,9.5391,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,225.12,1820.62,0.0 -256,32,8192,7168,21,0,18.6983,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,200.99,1604.38,0.0 -256,32,8192,8192,21,0,15.8689,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,270.65,2155.78,0.0 -256,32,8192,28672,21,0,50.1679,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,299.64,2360.54,0.0 -256,32,9216,16384,21,0,29.3716,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,329.01,2599.43,0.0 -256,32,10240,8192,21,0,16.5466,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,324.46,2582.37,0.0 -256,32,12288,512,21,0,4.5819,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,87.88,859.98,0.0 -256,32,12288,1536,21,0,6.3551,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,190.08,1612.59,0.0 -256,32,12288,4096,21,0,9.7836,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,329.25,2659.33,0.0 -256,32,12288,6144,21,0,13.449,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,359.27,2872.59,0.0 -256,32,12800,5120,21,0,17.2457,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,243.21,1952.32,0.0 -256,32,13312,16384,21,0,32.0127,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,436.03,3441.32,0.0 -256,32,14336,8192,29,0,18.6847,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,402.26,3198.81,0.0 -256,32,16384,512,21,0,4.6435,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,115.62,1130.84,0.0 -256,32,16384,2048,21,0,6.8106,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,315.31,2622.17,0.0 -256,32,16384,4096,21,0,13.3393,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,321.98,2598.98,0.0 -256,32,16384,6656,21,0,18.0028,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,387.68,3092.91,0.0 -256,32,16384,8192,21,0,20.7777,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,413.42,3286.63,0.0 -256,32,16384,13312,21,0,27.8363,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,501.45,3962.94,0.0 -256,32,16384,16384,21,0,33.0522,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,519.78,4100.44,0.0 -256,32,16384,26624,29,0,51.4954,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,542.13,4264.04,0.0 -256,32,16384,53248,21,0,96.3502,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,579.5,4547.04,0.0 -256,32,18432,7168,21,0,17.1967,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,491.71,3916.72,0.0 -256,32,18432,16384,22,0,63.2576,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,305.53,2409.78,0.0 -256,32,20480,16384,29,0,35.5786,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,603.59,4759.74,0.0 -256,32,24576,1536,21,0,6.3941,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,377.84,3201.67,0.0 -256,32,26624,16384,37,0,44.8726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,622.15,4904.33,0.0 -256,32,28672,4096,21,0,14.8004,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,507.84,4095.89,0.0 -256,32,32768,512,43,0,5.7265,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,187.5,1832.52,0.0 -256,32,51200,5120,14,0,25.3094,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,662.88,5311.49,0.0 -256,32,53248,16384,12,0,75.1943,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,742.54,5849.88,0.0 -256,32,57344,8192,14,0,45.7848,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,656.65,5213.13,0.0 -256,32,59136,8192,15,0,47.9633,a4w4_blockscale_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,646.42,5131.77,0.0 -256,32,106496,16384,17,0,142.9821,a4w4_blockscale_256x32x512x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,781.0,6151.07,0.0 -256,64,192,1024,21,0,3.8719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.5,40.2,0.0 -256,64,512,4096,21,0,6.2843,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,42.72,198.14,0.0 -256,64,512,7168,4,0,18.2008,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,25.81,117.02,0.0 -256,64,800,5120,22,0,25.8477,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,20.28,89.53,0.0 -256,64,1024,3072,21,0,6.3028,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.88,285.94,0.0 -256,64,1024,4096,21,0,8.9889,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,59.73,262.47,0.0 -256,64,1280,8192,21,0,17.9567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.75,315.7,0.0 -256,64,1536,3072,21,0,6.9252,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,87.21,383.27,0.0 -256,64,1536,7168,21,0,13.372,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,105.39,443.54,0.0 -256,64,2048,6144,21,0,11.8464,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,135.96,569.81,0.0 -256,64,2048,7168,21,0,13.7284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,136.87,570.46,0.0 -256,64,2304,16384,21,0,25.9264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,186.37,759.6,0.0 -256,64,2560,8192,21,0,17.5991,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,152.53,629.33,0.0 -256,64,3072,1536,21,0,6.149,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,98.22,455.63,0.0 -256,64,3072,6144,21,0,11.8707,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,203.52,844.69,0.0 -256,64,4096,1024,21,0,5.4105,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,99.23,490.57,0.0 -256,64,4096,4096,29,0,9.9442,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,215.95,909.47,0.0 -256,64,4096,8192,21,0,14.0415,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,305.88,1250.84,0.0 -256,64,4096,14336,21,0,23.3334,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,322.12,1300.42,0.0 -256,64,4608,7168,0,0,23.1497,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,182.63,748.79,0.0 -256,64,4608,16384,21,0,26.8437,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,360.0,1447.75,0.0 -256,64,5120,1280,21,0,6.7463,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,124.34,588.93,0.0 -256,64,5120,5120,29,0,13.4294,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,249.86,1037.01,0.0 -256,64,5120,6400,37,0,15.43,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,271.83,1117.57,0.0 -256,64,5120,25600,21,0,40.7807,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,411.4,1643.19,0.0 -256,64,6144,2048,21,0,6.6403,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,242.55,1075.77,0.0 -256,64,6144,3072,21,0,8.6835,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,278.22,1188.68,0.0 -256,64,6144,4096,29,0,9.8189,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,328.06,1374.94,0.0 -256,64,6144,12288,21,0,20.6497,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,467.98,1885.18,0.0 -256,64,6144,16384,21,0,27.112,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,475.25,1904.78,0.0 -256,64,6400,5120,21,0,15.5961,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,268.93,1113.55,0.0 -256,64,7168,2048,21,0,6.8112,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,275.88,1221.97,0.0 -256,64,7168,2304,0,0,10.2539,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,206.16,901.98,0.0 -256,64,7168,8192,21,0,15.3196,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,490.63,1993.51,0.0 -256,64,7168,16384,21,0,27.6557,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,543.55,2175.39,0.0 -256,64,7168,18432,21,0,30.5745,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,553.12,2209.93,0.0 -256,64,8192,1024,21,0,5.6831,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,188.94,928.3,0.0 -256,64,8192,2048,21,0,7.7271,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,277.92,1229.79,0.0 -256,64,8192,3584,21,0,9.2243,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,407.41,1717.56,0.0 -256,64,8192,4096,29,0,9.9642,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,431.04,1802.14,0.0 -256,64,8192,7168,37,0,17.772,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,422.92,1723.95,0.0 -256,64,8192,8192,21,0,15.5361,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,552.9,2244.14,0.0 -256,64,8192,28672,21,0,48.3134,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,622.29,2471.5,0.0 -256,64,9216,16384,21,0,29.2875,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,659.92,2635.98,0.0 -256,64,10240,8192,21,0,16.349,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,656.76,2661.69,0.0 -256,64,12288,512,21,0,4.5808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,175.8,1033.66,0.0 -256,64,12288,1536,21,0,6.3268,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,381.85,1747.99,0.0 -256,64,12288,4096,21,0,10.0,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,644.25,2686.98,0.0 -256,64,12288,6144,21,0,13.4251,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,719.82,2943.61,0.0 -256,64,12800,5120,21,0,16.7255,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,501.55,2066.92,0.0 -256,64,13312,16384,21,0,35.8494,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,778.74,3104.1,0.0 -256,64,14336,8192,21,0,18.4161,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,816.26,3302.4,0.0 -256,64,16384,512,21,0,5.3495,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,200.72,1179.15,0.0 -256,64,16384,2048,21,0,6.8013,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,631.49,2784.75,0.0 -256,64,16384,4096,21,0,14.4039,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,596.36,2484.23,0.0 -256,64,16384,6656,21,0,18.7155,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,745.83,3036.85,0.0 -256,64,16384,8192,21,0,20.7077,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,829.64,3354.7,0.0 -256,64,16384,13312,21,0,32.0337,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,871.5,3483.05,0.0 -256,64,16384,16384,21,0,32.8484,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1046.01,4165.78,0.0 -256,64,16384,26624,21,0,51.1923,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1090.68,4318.09,0.0 -256,64,16384,53248,21,0,96.564,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1156.43,4556.65,0.0 -256,64,18432,7168,29,0,20.1172,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,840.65,3412.45,0.0 -256,64,18432,16384,22,0,63.2113,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,611.52,2434.35,0.0 -256,64,20480,16384,29,0,41.0443,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1046.42,4164.23,0.0 -256,64,24576,1536,29,0,7.7228,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,625.66,2857.67,0.0 -256,64,26624,16384,29,0,46.4119,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1203.02,4784.03,0.0 -256,64,28672,4096,29,0,16.0086,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,939.02,3905.48,0.0 -256,64,32768,512,46,0,5.601,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,383.41,2249.47,0.0 -256,64,51200,5120,0,0,28.5654,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1174.65,4823.65,0.0 -256,64,53248,16384,0,0,81.0859,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1377.17,5470.1,0.0 -256,64,57344,8192,12,0,48.14,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,1249.06,5037.04,0.0 -256,64,59136,8192,1,0,50.0197,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1239.68,4999.08,0.0 -256,64,106496,16384,3,0,150.9403,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1479.65,5873.65,0.0 -256,65,1280,8192,0,0,23.9233,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,56.98,237.24,0.0 -256,112,5120,1280,21,0,6.6107,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,222.07,680.01,0.0 -256,112,5120,5120,21,0,14.0483,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,417.99,1035.06,0.0 -256,112,5120,6400,29,0,14.1005,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,520.55,1268.7,0.0 -256,112,5120,25600,21,0,38.6535,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,759.57,1762.23,0.0 -256,112,6400,5120,21,0,14.4899,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,506.56,1249.44,0.0 -256,112,8192,7168,29,0,17.7127,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,742.59,1783.84,0.0 -256,112,12800,5120,37,0,17.1116,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,857.9,2099.27,0.0 -256,112,51200,5120,9,0,34.2442,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1714.75,4170.85,0.0 -256,127,1280,8192,0,0,24.5329,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,108.56,248.16,0.0 -256,128,128,49920,21,0,72.1898,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.66,88.97,0.0 -256,128,128,322816,21,0,446.2108,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,23.71,92.68,0.0 -256,128,128,423168,21,0,583.2756,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,23.77,92.92,0.0 -256,128,256,256,29,0,3.4878,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.81,32.88,0.0 -256,128,256,1024,29,0,4.1218,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,16.28,63.6,0.0 -256,128,512,4096,21,0,6.4995,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,82.6,221.83,0.0 -256,128,512,7168,0,0,22.7857,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,41.23,106.42,0.0 -256,128,800,5120,24,0,32.7867,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,31.98,78.71,0.0 -256,128,1024,3072,21,0,6.4269,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,125.3,316.11,0.0 -256,128,1024,4096,21,0,8.7663,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,122.49,299.04,0.0 -256,128,1280,8192,21,0,14.3547,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,187.0,424.59,0.0 -256,128,1536,3072,21,0,6.826,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,176.96,432.04,0.0 -256,128,1536,7168,21,0,13.0508,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,215.97,487.1,0.0 -256,128,2048,6144,21,0,11.6,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,277.69,621.46,0.0 -256,128,2048,7168,21,0,13.396,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,280.54,621.31,0.0 -256,128,2304,16384,21,0,26.3592,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,366.61,778.2,0.0 -256,128,2560,8192,21,0,17.4515,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,307.64,668.45,0.0 -256,128,3072,1536,21,0,6.1683,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,195.83,525.92,0.0 -256,128,3072,6144,21,0,11.183,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,432.07,949.37,0.0 -256,128,4096,1024,21,0,5.3851,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,199.39,596.32,0.0 -256,128,4096,4096,21,0,10.0526,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,427.25,964.86,0.0 -256,128,4096,8192,21,0,13.7904,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,622.89,1330.64,0.0 -256,128,4096,14336,21,0,22.8828,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,656.93,1368.98,0.0 -256,128,4608,7168,0,0,23.5505,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,359.05,770.83,0.0 -256,128,4608,16384,21,0,26.1969,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,737.77,1526.02,0.0 -256,128,5120,1280,21,0,6.7817,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,247.39,688.54,0.0 -256,128,5120,5120,21,0,13.8116,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,485.89,1067.62,0.0 -256,128,5120,6400,21,0,15.856,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,529.05,1141.8,0.0 -256,128,5120,25600,21,0,40.0938,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,836.9,1708.12,0.0 -256,128,6144,2048,21,0,6.794,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,474.13,1176.83,0.0 -256,128,6144,3072,21,0,9.1419,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,528.54,1225.86,0.0 -256,128,6144,4096,21,0,10.2408,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,629.1,1407.89,0.0 -256,128,6144,12288,21,0,20.3662,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,948.99,1969.34,0.0 -256,128,6144,16384,21,0,26.9082,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,957.69,1967.92,0.0 -256,128,6400,5120,21,0,14.762,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,568.26,1243.06,0.0 -256,128,7168,2048,21,0,6.8857,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,545.78,1351.51,0.0 -256,128,7168,2304,0,0,10.3883,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,406.98,985.72,0.0 -256,128,7168,8192,21,0,15.244,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,986.12,2080.78,0.0 -256,128,7168,16384,21,0,27.6111,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1088.87,2231.13,0.0 -256,128,7168,18432,21,0,30.4024,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1112.51,2272.02,0.0 -256,128,8192,1024,21,0,5.7823,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,371.39,1099.39,0.0 -256,128,8192,2048,21,0,7.5231,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,570.9,1411.23,0.0 -256,128,8192,3584,21,0,9.4332,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,796.78,1802.84,0.0 -256,128,8192,4096,21,0,9.9528,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,863.07,1922.73,0.0 -256,128,8192,7168,21,0,17.5883,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,854.68,1814.62,0.0 -256,128,8192,8192,21,0,15.5661,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1103.67,2324.02,0.0 -256,128,8192,28672,21,0,48.4668,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1240.63,2504.24,0.0 -256,128,9216,16384,29,0,33.3105,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1160.44,2368.78,0.0 -256,128,10240,8192,29,0,18.6592,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1150.9,2416.44,0.0 -256,128,12288,512,29,0,5.1498,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,312.75,1228.05,0.0 -256,128,12288,1536,37,0,6.7742,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,713.27,1871.99,0.0 -256,128,12288,4096,29,0,11.2189,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1148.5,2546.92,0.0 -256,128,12288,6144,29,0,15.5916,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1239.6,2648.07,0.0 -256,128,12800,5120,0,0,18.1526,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,924.23,2003.71,0.0 -256,128,13312,16384,29,0,38.1269,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1464.44,2977.12,0.0 -256,128,14336,8192,29,0,21.2529,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1414.62,2960.28,0.0 -256,128,16384,512,29,0,5.3611,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,400.57,1570.83,0.0 -256,128,16384,2048,29,0,7.8866,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1089.18,2675.75,0.0 -256,128,16384,4096,37,0,16.0283,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1071.85,2371.49,0.0 -256,128,16384,6656,37,0,19.9661,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1398.23,2962.33,0.0 -256,128,16384,8192,29,0,22.9523,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1497.01,3129.42,0.0 -256,128,16384,13312,29,0,36.5127,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1529.18,3124.89,0.0 -256,128,16384,16384,29,0,38.2318,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1797.44,3647.76,0.0 -256,128,16384,26624,29,0,62.8972,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1775.42,3561.4,0.0 -256,128,16384,53248,29,0,112.9144,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1977.94,3930.5,0.0 -256,128,18432,7168,42,0,24.7415,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1367.05,2879.28,0.0 -256,128,18432,16384,22,0,67.9262,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,1138.14,2307.83,0.0 -256,128,20480,16384,42,0,49.9568,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1719.47,3484.28,0.0 -256,128,24576,1536,8,0,9.4678,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1020.69,2668.43,0.0 -256,128,26624,16384,42,0,55.1181,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2026.0,4099.71,0.0 -256,128,28672,4096,0,0,18.1925,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1652.59,3645.59,0.0 -256,128,32768,512,42,0,7.0236,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,611.51,2393.36,0.0 -256,128,51200,5120,9,0,35.8651,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1871.15,4029.18,0.0 -256,128,53248,16384,9,0,98.9807,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2256.38,4555.31,0.0 -256,128,57344,8192,9,0,52.3976,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2295.13,4772.84,0.0 -256,128,59136,8192,9,0,54.1038,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2292.21,4766.47,0.0 -256,128,106496,16384,11,0,157.4506,a4w4_blockscale_256x128x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2836.93,5720.69,0.0 -256,129,1280,8192,0,0,24.2013,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,111.78,252.11,0.0 -256,160,1280,8192,21,0,16.0131,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,209.54,393.92,0.0 -256,160,2304,16384,21,0,26.1975,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,461.1,798.64,0.0 -256,160,2560,8192,21,0,16.6775,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,402.39,717.15,0.0 -256,160,4608,16384,21,0,26.4423,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,913.66,1532.92,0.0 -256,160,5120,1280,21,0,6.7723,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,309.67,740.9,0.0 -256,160,5120,5120,21,0,14.1087,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,594.57,1074.17,0.0 -256,160,5120,6400,21,0,16.2285,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,646.13,1142.09,0.0 -256,160,5120,25600,21,0,39.9134,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1050.85,1734.31,0.0 -256,160,6400,5120,21,0,15.0264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,697.82,1253.9,0.0 -256,160,7168,8192,29,0,18.6767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1006.09,1729.92,0.0 -256,160,8192,1024,29,0,6.5603,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,409.18,1051.43,0.0 -256,160,8192,2048,29,0,7.7321,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,694.34,1445.13,0.0 -256,160,8192,3584,29,0,13.3701,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,702.71,1315.49,0.0 -256,160,8192,7168,37,0,19.1789,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,979.75,1697.44,0.0 -256,160,8192,8192,29,0,17.5633,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1222.71,2097.06,0.0 -256,160,8192,28672,29,0,54.6667,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1374.91,2238.21,0.0 -256,160,9216,16384,29,0,31.4915,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1534.33,2532.66,0.0 -256,160,10240,8192,29,0,19.5071,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1376.09,2351.72,0.0 -256,160,12800,5120,37,0,17.8875,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1172.41,2083.78,0.0 -256,160,13312,16384,37,0,39.7609,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1755.32,2882.79,0.0 -256,160,14336,8192,37,0,22.5625,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1665.64,2834.93,0.0 -256,160,16384,2048,37,0,9.3976,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1142.57,2360.6,0.0 -256,160,16384,4096,42,0,18.124,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1184.88,2158.74,0.0 -256,160,16384,6656,37,0,21.2181,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1644.66,2841.98,0.0 -256,160,16384,8192,37,0,25.055,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1714.22,2913.87,0.0 -256,160,16384,13312,37,0,36.0049,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1938.44,3204.0,0.0 -256,160,16384,26624,37,0,67.3608,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2072.22,3347.3,0.0 -256,160,26624,16384,46,0,60.9834,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2288.93,3737.64,0.0 -256,160,51200,5120,47,0,42.9748,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,1951.98,3440.75,0.0 -256,160,53248,16384,47,0,102.4961,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2723.74,4434.88,0.0 -256,160,57344,8192,50,0,60.9337,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2467.01,4166.6,0.0 -256,192,1280,8192,21,0,15.967,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,252.18,408.39,0.0 -256,192,2304,16384,21,0,26.2998,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,551.16,811.11,0.0 -256,192,2560,8192,21,0,16.9291,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,475.69,723.92,0.0 -256,192,4608,16384,21,0,26.5132,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1093.46,1549.83,0.0 -256,192,5120,1280,21,0,6.3153,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,398.49,849.64,0.0 -256,192,5120,5120,21,0,14.2691,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,705.46,1090.8,0.0 -256,192,5120,6400,21,0,17.0747,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,736.93,1110.68,0.0 -256,192,5120,25600,21,0,39.9244,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1260.67,1752.3,0.0 -256,192,6400,5120,29,0,17.6335,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,713.58,1096.39,0.0 -256,192,7168,8192,29,0,18.7241,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1204.25,1757.04,0.0 -256,192,8192,1024,29,0,6.5502,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,491.78,1135.59,0.0 -256,192,8192,2048,29,0,8.5152,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,756.58,1377.65,0.0 -256,192,8192,3584,29,0,12.1015,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,931.64,1501.45,0.0 -256,192,8192,7168,29,0,18.3343,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1229.86,1810.49,0.0 -256,192,8192,8192,29,0,18.9505,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1359.85,1978.13,0.0 -256,192,8192,28672,29,0,56.8402,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1586.8,2169.92,0.0 -256,192,9216,16384,29,0,33.7474,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1718.12,2388.61,0.0 -256,192,10240,8192,29,0,19.2596,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1672.53,2422.77,0.0 -256,192,12800,5120,37,0,18.4327,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1365.28,2071.03,0.0 -256,192,13312,16384,37,0,41.6901,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2008.91,2776.12,0.0 -256,192,14336,8192,37,0,23.7812,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1896.34,2733.74,0.0 -256,192,16384,2048,37,0,11.3951,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1130.74,2041.69,0.0 -256,192,16384,4096,46,0,18.5424,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1389.78,2170.11,0.0 -256,192,16384,6656,37,0,21.9033,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1911.85,2805.8,0.0 -256,192,16384,8192,37,0,25.9592,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1985.41,2857.82,0.0 -256,192,16384,13312,37,0,38.9353,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2151.05,2995.26,0.0 -256,192,16384,26624,37,0,69.5808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2407.33,3261.69,0.0 -256,192,26624,16384,49,0,66.9827,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2500.7,3432.23,0.0 -256,192,51200,5120,54,0,46.5597,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2162.03,3247.97,0.0 -256,192,53248,16384,50,0,112.2081,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2985.59,4083.73,0.0 -256,192,57344,8192,50,0,63.3964,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2845.41,4064.7,0.0 -256,256,512,4096,21,0,6.343,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,169.28,289.3,0.0 -256,256,512,7168,0,0,22.6525,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,82.95,133.08,0.0 -256,256,800,5120,22,0,36.8477,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,56.91,84.48,0.0 -256,256,1024,3072,21,0,6.8291,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,235.85,364.67,0.0 -256,256,1024,4096,21,0,9.0711,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,236.74,346.79,0.0 -256,256,1280,8192,21,0,16.2987,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,329.39,426.22,0.0 -256,256,1536,3072,21,0,6.9951,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,345.37,505.92,0.0 -256,256,1536,7168,21,0,12.9576,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,435.05,556.35,0.0 -256,256,2048,6144,21,0,11.5572,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,557.44,703.15,0.0 -256,256,2048,7168,21,0,13.2113,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,568.92,704.41,0.0 -256,256,2048,8192,21,0,13.9675,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,614.99,750.73,0.0 -256,256,2304,16384,21,0,26.2627,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,735.92,843.45,0.0 -256,256,2560,8192,21,0,16.8359,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,637.77,762.96,0.0 -256,256,3072,1536,21,0,6.1771,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,391.11,668.4,0.0 -256,256,3072,6144,21,0,11.1632,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,865.67,1056.73,0.0 -256,256,4096,1024,21,0,5.5715,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,385.44,776.34,0.0 -256,256,4096,4096,21,0,10.1458,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,846.65,1085.18,0.0 -256,256,4096,8192,21,0,13.9413,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1232.3,1429.06,0.0 -256,256,4096,14336,21,0,23.1245,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1300.13,1439.7,0.0 -256,256,4608,7168,0,0,23.7653,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,711.6,832.81,0.0 -256,256,4608,16384,29,0,30.6036,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1263.08,1379.09,0.0 -256,256,5120,1280,29,0,6.8795,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,487.75,881.18,0.0 -256,256,5120,5120,29,0,15.988,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,839.49,1024.77,0.0 -256,256,5120,6400,29,0,14.7318,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1138.84,1345.7,0.0 -256,256,5120,25600,29,0,47.1285,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1423.96,1515.73,0.0 -256,256,6144,2048,29,0,7.1546,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,900.46,1355.68,0.0 -256,256,6144,3072,37,0,9.9868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,967.64,1299.33,0.0 -256,256,6144,4096,29,0,10.2418,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1258.07,1586.92,0.0 -256,256,6144,12288,29,0,24.2139,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1596.38,1753.84,0.0 -256,256,6144,16384,29,0,31.4059,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1641.08,1769.56,0.0 -256,256,6400,5120,29,0,17.2608,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,971.98,1177.01,0.0 -256,256,7168,2048,29,0,7.8351,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,959.3,1438.68,0.0 -256,256,7168,2304,0,0,10.6915,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,790.88,1143.19,0.0 -256,256,7168,8192,29,0,17.9205,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1677.67,1901.66,0.0 -256,256,7168,16384,29,0,32.5458,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1847.54,1981.44,0.0 -256,256,7168,18432,29,0,36.2989,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1863.58,1986.0,0.0 -256,256,8192,1024,29,0,6.5497,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,655.75,1300.77,0.0 -256,256,8192,2048,29,0,8.1011,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1060.34,1585.59,0.0 -256,256,8192,3584,29,0,14.0739,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1068.1,1373.69,0.0 -256,256,8192,4096,29,0,10.3885,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1653.74,2069.19,0.0 -256,256,8192,7168,29,0,18.8783,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1592.56,1826.01,0.0 -256,256,8192,8192,29,0,18.4185,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1865.5,2106.43,0.0 -256,256,8192,28672,29,0,58.3537,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2060.86,2147.33,0.0 -256,256,9216,16384,37,0,36.7985,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2100.88,2236.86,0.0 -256,256,10240,8192,37,0,21.2222,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2023.81,2272.83,0.0 -256,256,12288,512,38,0,6.3583,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,506.62,1494.54,0.0 -256,256,12288,1536,1,0,9.8009,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,986.0,1624.88,0.0 -256,256,12288,4096,42,0,14.5442,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1771.83,2198.92,0.0 -256,256,12288,6144,42,0,20.0605,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1926.91,2234.57,0.0 -256,256,12800,5120,42,0,18.789,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1785.86,2127.68,0.0 -256,256,13312,16384,42,0,46.7414,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2389.08,2523.78,0.0 -256,256,14336,8192,42,0,26.5916,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2261.22,2523.69,0.0 -256,256,16384,512,30,0,6.8027,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,631.36,1859.33,0.0 -256,256,16384,2048,42,0,10.4519,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1643.71,2432.86,0.0 -256,256,16384,4096,38,0,19.0289,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1805.66,2231.73,0.0 -256,256,16384,6656,42,0,24.1571,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2311.31,2639.66,0.0 -256,256,16384,8192,42,0,28.2495,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2432.59,2709.64,0.0 -256,256,16384,13312,42,0,43.2786,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2580.24,2752.96,0.0 -256,256,16384,16384,42,0,49.8604,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2756.48,2902.17,0.0 -256,256,16384,26624,42,0,78.9684,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2828.2,2911.29,0.0 -256,256,16384,53248,42,0,145.1528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,3077.29,3109.91,0.0 -256,256,18432,7168,38,0,30.4769,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2219.57,2507.31,0.0 -256,256,18432,16384,2,0,97.5677,a4w4_blockscale_256x64x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1584.73,1665.81,0.0 -256,256,20480,16384,38,0,62.6988,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2740.06,2876.53,0.0 -256,256,24576,1536,8,0,12.7275,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1518.55,2487.05,0.0 -256,256,26624,16384,43,0,73.3227,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3045.96,3189.09,0.0 -256,256,28672,4096,43,0,24.4961,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2454.66,3017.81,0.0 -256,256,32768,512,43,0,9.9975,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,859.21,2523.77,0.0 -256,256,51200,5120,54,0,50.4785,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2658.91,3128.89,0.0 -256,256,53248,16384,54,0,132.8006,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3363.51,3505.77,0.0 -256,256,57344,8192,18,0,71.2376,a4w4_blockscale_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,3376.28,3724.01,0.0 -256,256,59136,8192,18,0,74.1445,a4w4_blockscale_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,3345.28,3689.38,0.0 -256,256,106496,16384,18,0,249.1001,a4w4_blockscale_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,3586.32,3729.58,0.0 -256,288,1280,8192,21,0,16.4719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,366.67,434.67,0.0 -256,288,2304,16384,21,0,26.1796,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,830.54,861.77,0.0 -256,288,2560,8192,21,0,14.3768,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,840.21,913.97,0.0 -256,288,4608,16384,29,0,29.1664,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1490.98,1466.15,0.0 -256,288,5120,1280,29,0,7.3211,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,515.62,875.58,0.0 -256,288,5120,5120,29,0,13.8399,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1091.01,1213.42,0.0 -256,288,5120,6400,42,0,18.5633,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1016.76,1091.12,0.0 -256,288,5120,25600,29,0,44.3246,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1703.29,1628.25,0.0 -256,288,6400,5120,29,0,16.8247,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1121.82,1236.73,0.0 -256,288,7168,8192,37,0,21.1725,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1597.49,1637.43,0.0 -256,288,8192,1024,37,0,7.5191,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,642.61,1204.98,0.0 -256,288,8192,2048,42,0,10.6831,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,904.58,1254.52,0.0 -256,288,8192,3584,42,0,14.8917,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1135.63,1337.31,0.0 -256,288,8192,7168,37,0,19.5302,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1731.82,1797.78,0.0 -256,288,8192,8192,37,0,21.6423,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1786.07,1822.94,0.0 -256,288,8192,28672,37,0,66.796,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2025.44,1890.65,0.0 -256,288,9216,16384,37,0,39.9597,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2176.52,2081.23,0.0 -256,288,10240,8192,37,0,22.4527,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2152.01,2183.3,0.0 -256,288,12800,5120,46,0,20.6536,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1827.71,1979.22,0.0 -256,288,13312,16384,46,0,50.3598,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2494.6,2364.56,0.0 -256,288,14336,8192,46,0,28.4269,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2379.64,2397.64,0.0 -256,288,16384,2048,46,0,14.2595,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1355.4,1859.06,0.0 -256,288,16384,4096,46,0,18.264,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2116.44,2386.19,0.0 -256,288,16384,6656,46,0,25.5322,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2460.18,2542.73,0.0 -256,288,16384,8192,46,0,30.4004,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2543.04,2556.73,0.0 -256,288,16384,13312,46,0,45.1824,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2780.46,2664.89,0.0 -256,288,16384,26624,46,0,83.936,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2993.42,2756.56,0.0 -256,288,26624,16384,47,0,77.0326,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3261.68,3061.02,0.0 -256,288,51200,5120,41,0,54.2684,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x640E,2782.37,2972.27,0.0 -256,288,53248,16384,41,0,147.3597,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x640E,3410.1,3184.3,0.0 -256,288,57344,8192,47,0,87.9704,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3075.84,3058.88,0.0 -256,320,2560,8192,21,0,16.4166,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,817.57,818.37,0.0 -256,320,5120,1280,29,0,7.4259,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,564.82,910.11,0.0 -256,320,5120,5120,22,0,16.5703,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x256E,1012.49,1038.19,0.0 -256,320,5120,6400,29,0,18.0264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1163.38,1147.47,0.0 -256,320,5120,25600,29,0,47.2784,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1774.3,1542.12,0.0 -256,320,6400,5120,29,0,16.745,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1252.4,1271.97,0.0 -256,320,7168,8192,37,0,20.6963,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1815.83,1703.61,0.0 -256,320,8192,1024,37,0,7.6808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,698.98,1250.0,0.0 -256,320,8192,2048,37,0,8.971,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1196.9,1556.03,0.0 -256,320,8192,3584,37,0,14.0025,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1341.94,1463.77,0.0 -256,320,8192,7168,37,0,19.5769,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1919.66,1826.13,0.0 -256,320,8192,8192,37,0,21.16,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2029.76,1895.46,0.0 -256,320,8192,28672,37,0,70.3383,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2137.16,1809.41,0.0 -256,320,10240,8192,42,0,24.4175,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2198.71,2039.82,0.0 -256,320,14336,8192,46,0,30.445,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2468.78,2273.15,0.0 -256,320,51200,5120,47,0,62.2725,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2694.16,2644.17,0.0 -256,320,57344,8192,47,0,88.7921,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3385.97,3073.38,0.0 -256,384,1280,8192,21,0,16.1226,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,499.49,483.72,0.0 -256,384,2304,16384,21,0,26.2829,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1103.04,905.13,0.0 -256,384,2560,8192,21,0,17.85,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,902.3,785.7,0.0 -256,384,4608,16384,29,0,31.6982,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1829.19,1401.76,0.0 -256,384,5120,1280,29,0,7.0087,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,718.13,1063.64,0.0 -256,384,5120,5120,29,0,17.1449,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1174.27,1051.18,0.0 -256,384,5120,6400,29,0,18.6953,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1346.1,1152.43,0.0 -256,384,5120,25600,29,0,48.0141,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2096.54,1549.2,0.0 -256,384,6400,5120,37,0,18.0754,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1392.27,1232.74,0.0 -256,384,7168,8192,37,0,21.6494,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2083.07,1683.1,0.0 -256,384,8192,1024,37,0,7.5727,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,850.75,1410.64,0.0 -256,384,8192,2048,37,0,9.5408,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1350.51,1579.88,0.0 -256,384,8192,3584,46,0,16.2207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1390.11,1335.31,0.0 -256,384,8192,7168,37,0,19.9731,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2257.89,1853.89,0.0 -256,384,8192,8192,37,0,21.9654,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2346.4,1885.64,0.0 -256,384,8192,28672,37,0,69.2615,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2604.46,1865.93,0.0 -256,384,9216,16384,42,0,44.5812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2601.19,1922.81,0.0 -256,384,10240,8192,42,0,25.4148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2534.92,2021.67,0.0 -256,384,12800,5120,38,0,23.8257,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2112.49,1829.18,0.0 -256,384,13312,16384,49,0,57.4449,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2915.9,2131.11,0.0 -256,384,14336,8192,38,0,32.5471,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2771.19,2190.77,0.0 -256,384,16384,2048,49,0,16.6778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,1545.16,1784.01,0.0 -256,384,16384,4096,38,0,20.4734,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2517.39,2291.94,0.0 -256,384,16384,6656,38,0,29.552,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2834.05,2314.12,0.0 -256,384,16384,8192,38,0,34.6005,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2979.13,2348.66,0.0 -256,384,16384,13312,38,0,53.2326,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,3146.64,2332.98,0.0 -256,384,16384,26624,49,0,95.913,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3492.83,2458.46,0.0 -256,384,26624,16384,50,0,88.0544,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3804.55,2744.86,0.0 -256,384,51200,5120,54,0,72.5351,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2775.57,2362.67,0.0 -256,384,53248,16384,44,0,191.233,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3503.66,2511.32,0.0 -256,384,57344,8192,10,0,105.5947,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,3416.62,2656.33,0.0 -256,416,1280,8192,21,0,15.9344,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,547.5,502.8,0.0 -256,512,512,4096,21,0,7.2567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,295.93,361.24,0.0 -256,512,512,7168,0,0,23.2885,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,161.37,180.1,0.0 -256,512,800,5120,21,0,38.7761,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,108.17,107.74,0.0 -256,512,1024,3072,21,0,7.2108,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,446.72,472.61,0.0 -256,512,1280,8192,21,0,16.2544,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,660.59,532.21,0.0 -256,512,1536,7168,21,0,12.9046,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,873.66,690.68,0.0 -256,512,1792,7424,21,0,13.7827,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,988.42,753.66,0.0 -256,512,2048,6144,21,0,11.6892,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1102.29,852.19,0.0 -256,512,2304,16384,29,0,30.5848,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1263.85,831.39,0.0 -256,512,2560,8192,29,0,17.993,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1193.51,845.01,0.0 -256,512,3072,1536,0,0,8.7335,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,553.25,675.36,0.0 -256,512,3072,6144,29,0,13.6979,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1410.97,1033.43,0.0 -256,512,4096,4096,29,0,10.0912,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1702.46,1350.83,0.0 -256,512,4096,8192,29,0,16.949,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2027.24,1361.06,0.0 -256,512,4096,14336,29,0,28.388,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2118.13,1311.27,0.0 -256,512,4608,7168,4,0,29.4314,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1149.21,783.81,0.0 -256,512,4608,16384,37,0,35.2309,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2194.36,1324.45,0.0 -256,512,5120,1280,37,0,8.2567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,812.78,1071.54,0.0 -256,512,5120,5120,37,0,17.3284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1549.11,1134.6,0.0 -256,512,5120,6400,37,0,18.7412,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1790.41,1241.4,0.0 -256,512,5120,25600,37,0,58.227,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2305.08,1328.12,0.0 -256,512,6144,3072,42,0,12.4658,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1550.43,1324.83,0.0 -256,512,6144,4096,42,0,14.0105,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1839.32,1422.0,0.0 -256,512,6144,12288,42,0,33.2974,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2321.79,1417.11,0.0 -256,512,6144,16384,42,0,42.6302,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2417.99,1426.63,0.0 -256,512,6400,5120,42,0,19.2804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1740.34,1257.67,0.0 -256,512,7168,2048,1,0,13.0804,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1149.23,1162.38,0.0 -256,512,7168,2304,8,0,12.9047,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1310.49,1254.38,0.0 -256,512,7168,8192,42,0,24.4684,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2457.44,1585.61,0.0 -256,512,7168,16384,42,0,43.582,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2759.38,1612.01,0.0 -256,512,7168,18432,42,0,49.9132,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2710.53,1565.1,0.0 -256,512,8192,1024,42,0,9.0616,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,947.95,1417.53,0.0 -256,512,8192,2048,46,0,13.7717,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1247.48,1256.31,0.0 -256,512,8192,3584,46,0,16.8791,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1781.18,1421.06,0.0 -256,512,8192,4096,42,0,14.6073,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2352.23,1794.61,0.0 -256,512,8192,7168,42,0,22.9346,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2621.78,1725.94,0.0 -256,512,8192,8192,42,0,25.2603,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2720.45,1743.45,0.0 -256,512,8192,28672,42,0,77.5444,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,3101.68,1717.33,0.0 -256,512,9216,16384,49,0,53.7146,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2878.53,1659.31,0.0 -256,512,10240,8192,38,0,30.4085,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2824.85,1793.12,0.0 -256,512,12288,512,43,0,9.5274,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,676.2,1664.64,0.0 -256,512,12288,1536,8,0,12.1891,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1585.63,1838.8,0.0 -256,512,12288,4096,43,0,20.8203,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2475.45,1863.44,0.0 -256,512,12288,6144,43,0,28.591,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2703.98,1815.41,0.0 -256,512,12800,5120,47,0,26.9162,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2493.25,1753.07,0.0 -256,512,13312,16384,43,0,65.6367,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3402.64,1933.03,0.0 -256,512,14336,8192,43,0,38.0196,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3163.08,1985.75,0.0 -256,512,16384,512,53,0,10.2901,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,834.78,2050.77,0.0 -256,512,16384,2048,43,0,14.9724,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2294.87,2276.1,0.0 -256,512,16384,4096,43,0,22.894,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3001.64,2244.27,0.0 -256,512,16384,6656,43,0,33.6862,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3314.98,2167.27,0.0 -256,512,16384,8192,43,0,38.1581,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3601.83,2253.34,0.0 -256,512,16384,13312,43,0,58.5844,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3812.25,2206.0,0.0 -256,512,16384,16384,43,0,68.3178,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4023.52,2271.58,0.0 -256,512,16384,26624,43,0,109.2596,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4088.21,2212.13,0.0 -256,512,16384,53248,43,0,204.9689,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4358.48,2276.52,0.0 -256,512,18432,7168,50,0,41.6828,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3245.74,2081.67,0.0 -256,512,20480,16384,50,0,86.5746,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3968.8,2228.57,0.0 -256,512,24576,1536,10,0,18.1666,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2127.79,2445.88,0.0 -256,512,26624,16384,54,0,104.3943,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4278.75,2390.56,0.0 -256,512,28672,4096,54,0,36.5438,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3290.82,2438.96,0.0 -256,512,32768,512,54,0,14.2158,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1208.51,2959.67,0.0 -256,512,51200,5120,54,0,79.3531,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3382.8,2328.98,0.0 -256,512,53248,16384,54,0,206.5008,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4326.15,2396.74,0.0 -256,512,57344,8192,54,0,114.8863,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4187.06,2573.84,0.0 -256,512,59136,8192,54,0,120.2706,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4124.61,2534.9,0.0 -256,512,106496,16384,54,0,393.8495,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4536.52,2502.63,0.0 -256,640,256,768,21,0,4.4726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,56.27,150.19,0.0 -256,640,1280,8192,21,0,16.1806,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,829.5,587.29,0.0 -256,640,2304,16384,29,0,31.0486,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1556.22,871.74,0.0 -256,640,2560,8192,29,0,18.3344,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1464.11,893.62,0.0 -256,640,4608,16384,37,0,37.6937,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2563.74,1297.03,0.0 -256,640,5120,1280,30,0,9.1065,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,921.17,1124.47,0.0 -256,640,5120,5120,42,0,18.2902,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1834.56,1164.51,0.0 -256,640,5120,6400,42,0,20.0998,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2086.74,1243.08,0.0 -256,640,5120,25600,42,0,65.8701,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2547.02,1218.79,0.0 -256,640,6400,5120,42,0,19.3128,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2171.77,1357.36,0.0 -256,640,7168,8192,46,0,28.4912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2638.08,1444.54,0.0 -256,640,7936,540672,46,0,1458.2397,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,3766.31,1596.83,0.0 -256,640,8192,1024,13,0,9.4548,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,1135.66,1587.31,0.0 -256,640,8192,2048,46,0,15.5769,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1378.63,1253.76,0.0 -256,640,8192,3584,38,0,18.5675,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2024.02,1417.14,0.0 -256,640,8192,7168,46,0,26.2059,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2868.13,1608.02,0.0 -256,640,8192,8192,46,0,29.5225,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2909.62,1580.54,0.0 -256,640,8192,28672,46,0,88.2533,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,3406.65,1553.5,0.0 -256,640,9216,16384,38,0,61.2521,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,3155.38,1510.75,0.0 -256,640,10240,8192,51,0,34.6221,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3101.32,1665.75,0.0 -256,640,12800,5120,43,0,29.0678,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2885.88,1747.31,0.0 -256,640,13312,16384,47,0,74.153,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3764.82,1771.12,0.0 -256,640,14336,8192,47,0,42.5891,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3529.63,1871.18,0.0 -256,640,16384,2048,47,0,19.1148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2246.93,2009.13,0.0 -256,640,16384,4096,47,0,26.5896,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3230.56,2099.94,0.0 -256,640,16384,6656,47,0,38.105,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3663.21,2037.2,0.0 -256,640,16384,8192,47,0,45.9881,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3735.72,1972.29,0.0 -256,640,16384,13312,47,0,69.754,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4002.25,1925.1,0.0 -256,640,16384,26624,47,0,119.6467,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4666.62,2069.38,0.0 -256,640,26624,16384,47,0,143.1188,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3901.27,1798.68,0.0 -256,640,51200,5120,45,0,101.0597,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3320.26,1961.68,0.0 -256,640,53248,16384,48,0,279.1174,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4000.79,1825.78,0.0 -256,640,57344,8192,43,0,162.3674,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3703.3,1914.81,0.0 -256,768,1280,8192,21,0,16.8204,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,957.54,615.6,0.0 -256,768,2304,16384,29,0,31.7907,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1823.87,902.93,0.0 -256,768,2560,8192,29,0,18.2695,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1763.17,961.36,0.0 -256,768,4608,16384,42,0,42.9379,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2700.74,1190.51,0.0 -256,768,5120,1280,30,0,9.2191,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,1091.9,1261.8,0.0 -256,768,5120,5120,42,0,18.7867,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2143.29,1220.95,0.0 -256,768,5120,6400,42,0,20.7759,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2422.6,1285.43,0.0 -256,768,5120,25600,42,0,67.9876,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2961.23,1224.2,0.0 -256,768,6400,5120,46,0,21.2145,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2372.51,1328.36,0.0 -256,768,7168,8192,49,0,31.7305,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2842.51,1371.42,0.0 -256,768,8192,1024,13,0,11.9823,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,1075.33,1432.98,0.0 -256,768,8192,2048,49,0,16.0083,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,1609.78,1359.17,0.0 -256,768,8192,3584,49,0,18.1697,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2482.0,1576.21,0.0 -256,768,8192,7168,49,0,29.5435,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3052.93,1512.87,0.0 -256,768,8192,8192,49,0,32.5122,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3170.48,1515.83,0.0 -256,768,8192,28672,49,0,97.7048,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3692.52,1443.47,0.0 -256,768,9216,16384,43,0,65.8922,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3519.81,1456.09,0.0 -256,768,10240,8192,43,0,37.1829,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3465.28,1635.63,0.0 -256,768,12800,5120,47,0,32.7883,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3070.1,1658.97,0.0 -256,768,13312,16384,52,0,85.5076,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3917.87,1588.05,0.0 -256,768,14336,8192,50,0,49.5292,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3642.07,1693.67,0.0 -256,768,16384,2048,50,0,20.4289,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2522.88,2091.62,0.0 -256,768,16384,4096,50,0,30.2755,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3404.71,1991.48,0.0 -256,768,16384,6656,50,0,45.6941,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3665.76,1799.96,0.0 -256,768,16384,8192,50,0,52.9874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3890.71,1800.81,0.0 -256,768,16384,13312,52,0,77.5727,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4318.63,1796.12,0.0 -256,768,16384,26624,50,0,138.9553,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4821.8,1824.28,0.0 -256,768,26624,16384,50,0,165.2907,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4053.55,1604.99,0.0 -256,768,51200,5120,54,0,111.9765,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3595.87,1890.41,0.0 -256,768,53248,16384,54,0,289.4301,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4629.89,1811.45,0.0 -256,768,57344,8192,54,0,167.4114,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4310.07,1947.94,0.0 -256,768,547328,2048,54,0,526.8722,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3267.86,2660.88,0.0 -256,832,1280,8192,29,0,18.4303,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,946.72,584.94,0.0 -256,864,1280,8192,29,0,18.3432,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,987.8,599.33,0.0 -256,896,13184,53504,44,0,276.8915,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4565.22,1445.67,0.0 -256,1024,512,4096,21,0,9.3769,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,458.04,447.3,0.0 -256,1024,512,7168,0,0,24.5597,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,306.04,266.84,0.0 -256,1024,800,5120,23,0,46.2669,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,181.31,136.34,0.0 -256,1024,1024,3072,21,0,9.0159,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,714.57,581.51,0.0 -256,1024,1280,8192,29,0,18.2932,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1173.92,659.19,0.0 -256,1024,1536,7168,29,0,15.7978,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1427.32,779.9,0.0 -256,1024,2048,6144,29,0,14.261,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1807.01,955.86,0.0 -256,1024,2304,16384,37,0,36.3864,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2124.68,878.94,0.0 -256,1024,2560,8192,37,0,21.1445,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2031.25,942.23,0.0 -256,1024,3072,1536,8,0,10.7099,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,902.31,881.16,0.0 -256,1024,3072,6144,42,0,19.4604,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1986.33,969.89,0.0 -256,1024,4096,4096,42,0,14.6518,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2345.09,1288.19,0.0 -256,1024,4096,8192,42,0,24.7357,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2778.15,1186.95,0.0 -256,1024,4096,14336,42,0,41.0531,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2929.35,1098.3,0.0 -256,1024,4608,7168,5,0,42.7675,a4w4_blockscale_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1581.71,692.64,0.0 -256,1024,4608,16384,46,0,49.665,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,3113.24,1118.99,0.0 -256,1024,5120,1280,38,0,10.8223,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1240.2,1332.24,0.0 -256,1024,5120,5120,49,0,23.1664,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2317.46,1131.57,0.0 -256,1024,5120,6400,49,0,25.3882,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2643.31,1187.42,0.0 -256,1024,5120,25600,49,0,84.2335,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3186.8,1058.12,0.0 -256,1024,6144,3072,51,0,17.8126,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2170.08,1324.51,0.0 -256,1024,6144,4096,51,0,20.4317,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2522.53,1334.35,0.0 -256,1024,6144,12288,51,0,49.6008,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3117.26,1141.58,0.0 -256,1024,6144,16384,43,0,61.8487,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3333.27,1152.86,0.0 -256,1024,6400,5120,51,0,26.509,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2531.55,1211.39,0.0 -256,1024,7168,2048,8,0,19.98,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1504.74,1154.59,0.0 -256,1024,7168,2304,9,0,22.1345,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1528.06,1089.58,0.0 -256,1024,7168,8192,43,0,36.9755,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3252.4,1304.5,0.0 -256,1024,7168,16384,43,0,67.0567,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3586.79,1219.7,0.0 -256,1024,7168,18432,53,0,75.1228,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,3601.88,1200.4,0.0 -256,1024,8192,1024,8,0,14.3455,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1197.58,1498.44,0.0 -256,1024,8192,2048,43,0,18.0551,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,1903.05,1451.91,0.0 -256,1024,8192,3584,43,0,21.7992,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2758.34,1527.23,0.0 -256,1024,8192,4096,43,0,21.6498,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3174.14,1646.74,0.0 -256,1024,8192,7168,43,0,35.1762,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3418.76,1415.94,0.0 -256,1024,8192,8192,53,0,38.144,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,3603.16,1429.48,0.0 -256,1024,8192,28672,43,0,111.089,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4330.19,1340.35,0.0 -256,1024,9216,16384,47,0,73.6628,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4198.02,1395.01,0.0 -256,1024,10240,8192,50,0,49.6385,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3461.0,1351.95,0.0 -256,1024,10880,28416,54,0,149.9933,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4221.34,1276.15,0.0004 -256,1024,12288,512,44,0,12.3061,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,1047.03,2321.91,0.0 -256,1024,12288,1536,44,0,17.9317,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,2155.66,1973.57,0.0 -256,1024,12288,4096,44,0,31.5117,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3271.14,1663.79,0.0 -256,1024,12288,6144,44,0,43.5271,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3552.24,1517.68,0.0 -256,1024,12800,5120,48,0,40.8113,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3288.74,1509.48,0.0 -256,1024,13312,16384,54,0,97.7036,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4571.75,1481.05,0.0 -256,1024,14336,8192,54,0,60.3453,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3985.7,1529.11,0.0 -256,1024,16384,512,45,0,14.463,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1187.85,2628.15,0.0 -256,1024,16384,2048,54,0,21.8036,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3151.75,2356.5,0.0 -256,1024,16384,4096,54,0,34.1505,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4024.51,2026.5,0.0 -256,1024,16384,6656,54,0,52.7494,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4233.95,1734.39,0.0 -256,1024,16384,8192,54,0,59.9643,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4584.03,1748.67,0.0 -256,1024,16384,13312,54,0,91.4984,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4881.8,1633.06,0.0 -256,1024,16384,16384,54,0,111.7138,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4921.11,1576.89,0.0 -256,1024,16384,26624,54,0,163.6572,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5458.69,1621.01,0.0 -256,1024,16384,53248,54,0,330.6898,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5402.97,1502.99,0.0 -256,1024,18432,7168,47,0,75.4356,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3586.94,1424.78,0.0 -256,1024,20480,16384,50,0,165.8132,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4144.39,1315.36,0.0 -256,1024,24576,1536,54,0,34.8437,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2218.75,2008.75,0.0 -256,1024,26624,16384,54,0,189.7626,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4707.74,1480.89,0.0 -256,1024,28672,4096,54,0,67.7295,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3551.16,1764.93,0.0 -256,1024,32768,512,54,0,25.3249,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1356.76,2991.51,0.0 -256,1024,51200,5120,48,0,143.817,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3733.01,1658.71,0.0 -256,1024,53248,16384,54,0,375.89,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4753.27,1472.9,0.0 -256,1024,57344,8192,54,0,212.8558,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4519.83,1674.92,0.0 -256,1024,59136,8192,54,0,219.812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4513.57,1672.0,0.0 -256,1024,106496,16384,54,0,695.1664,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5140.37,1580.78,0.0 -256,1152,1280,8192,29,0,18.0,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1342.18,717.26,0.0 -256,1152,2304,16384,37,0,38.6497,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2250.29,869.86,0.0 -256,1152,2560,8192,37,0,21.6324,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2233.61,975.51,0.0 -256,1152,4608,16384,49,0,54.2861,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3204.25,1064.78,0.0 -256,1152,5120,1280,38,0,13.5937,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1110.77,1163.08,0.0 -256,1152,5120,5120,38,0,23.5739,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2562.07,1181.51,0.0 -256,1152,5120,6400,49,0,26.6498,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2832.95,1195.76,0.0 -256,1152,5120,25600,49,0,85.8163,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3519.03,1072.97,0.0 -256,1152,6400,5120,53,0,29.0565,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,2598.3,1172.84,0.0 -256,1152,7168,8192,43,0,39.4724,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3427.5,1281.75,0.0 -256,1152,8192,1024,47,0,14.4799,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,1334.77,1633.89,0.0 -256,1152,8192,2048,47,0,17.1527,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2253.56,1658.2,0.0 -256,1152,8192,3584,47,0,23.2214,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2913.08,1533.88,0.0 -256,1152,8192,7168,47,0,38.4727,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3516.56,1361.05,0.0 -256,1152,8192,8192,47,0,41.1237,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3759.85,1389.65,0.0 -256,1152,8192,28672,47,0,121.8771,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4440.26,1253.97,0.0 -256,1152,9216,16384,50,0,85.6876,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4060.01,1239.02,0.0 -256,1152,10240,8192,50,0,50.9078,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3796.54,1380.04,0.0 -256,1152,12800,5120,54,0,43.4897,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3471.97,1499.4,0.0 -256,1152,13312,16384,45,0,119.9853,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4188.11,1243.15,0.0 -256,1152,14336,8192,45,0,66.8811,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4045.73,1442.4,0.0 -256,1152,16384,2048,47,0,30.2016,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2559.78,1844.46,0.0 -256,1152,16384,4096,47,0,46.9601,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3292.56,1568.62,0.0 -256,1152,16384,6656,47,0,69.2029,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3630.71,1388.79,0.0 -256,1152,16384,8192,47,0,81.3708,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3800.35,1346.63,0.0 -256,1152,16384,13312,47,0,123.7448,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4060.87,1248.28,0.0 -256,1152,16384,26624,47,0,232.4817,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4323.02,1166.49,0.0 -256,1152,26624,16384,45,0,226.9321,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4428.74,1272.99,0.0 -256,1152,51200,5120,54,0,155.4048,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3886.49,1621.48,0.0 -256,1152,53248,16384,54,0,451.0597,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4456.27,1259.98,0.0 -256,1152,57344,8192,54,0,254.8822,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4246.4,1458.4,0.0 -256,1536,512,7168,0,0,25.5821,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,440.71,348.4,0.0 -256,1536,1280,8192,29,0,17.9019,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1799.38,863.96,0.0 -256,1536,1536,7168,4,0,29.9466,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1129.44,525.22,0.0 -256,1536,2304,16384,42,0,45.0414,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2574.61,855.55,0.0 -256,1536,2560,8192,42,0,25.354,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2541.0,971.9,0.0 -256,1536,3072,1536,4,0,12.9268,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1121.35,1003.82,0.0 -256,1536,3200,10496,46,0,37.9321,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2720.12,914.4,0.0 -256,1536,3456,517376,38,0,1592.2596,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,3449.75,817.7,0.0 -256,1536,4608,7168,6,0,54.4245,a4w4_blockscale_256x96x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1864.39,664.7,0.0 -256,1536,4608,16384,51,0,65.3407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3549.52,986.94,0.0 -256,1536,5120,1280,43,0,15.6453,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,1286.82,1277.6,0.0 -256,1536,5120,5120,43,0,28.2912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2846.49,1158.24,0.0 -256,1536,5120,6400,43,0,31.8861,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3156.96,1161.25,0.0 -256,1536,5120,25600,43,0,100.8438,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3992.84,1000.81,0.0 -256,1536,6400,5120,47,0,33.9068,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2968.82,1179.02,0.0 -256,1536,7168,2048,10,0,26.9209,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1675.17,1149.03,0.0 -256,1536,7168,2304,7,0,29.4102,a4w4_blockscale_256x96x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1725.06,1089.66,0.0 -256,1536,7168,8192,50,0,48.3249,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3732.83,1193.42,0.0 -256,1536,8192,1024,45,0,17.5535,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1468.07,1717.41,0.0 -256,1536,8192,2048,50,0,20.3498,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2532.68,1726.17,0.0 -256,1536,8192,3584,54,0,28.3942,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3176.5,1500.25,0.0 -256,1536,8192,7168,50,0,46.6282,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3868.66,1287.44,0.0 -256,1536,8192,8192,52,0,53.7063,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3838.63,1210.5,0.0 -256,1536,8192,28672,50,0,142.6752,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,5057.32,1153.85,0.0 -256,1536,9216,16384,54,0,101.515,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4569.34,1146.55,0.0 -256,1536,10240,8192,54,0,59.7295,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4314.42,1334.21,0.0 -256,1536,12800,5120,47,0,61.135,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3293.15,1243.51,0.0 -256,1536,13312,16384,50,0,157.0373,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4266.6,1034.97,0.0 -256,1536,14336,8192,54,0,93.3554,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3864.56,1168.14,0.0 -256,1536,16384,2048,54,0,36.3137,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2838.58,1891.34,0.0 -256,1536,16384,4096,54,0,58.7302,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3510.26,1481.89,0.0 -256,1536,16384,6656,50,0,83.9826,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3989.01,1309.43,0.0 -256,1536,16384,8192,50,0,99.0521,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4162.63,1249.16,0.0 -256,1536,16384,13312,50,0,143.3822,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4672.93,1182.9,0.0 -256,1536,16384,26624,50,0,259.8596,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,5156.75,1111.69,0.0 -256,1536,26624,16384,54,0,280.4802,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4777.63,1114.07,0.0 -256,1536,51200,5120,54,0,185.6814,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4337.03,1574.15,0.0 -256,1536,53248,16384,54,0,505.4454,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5302.37,1211.54,0.0 -256,1536,57344,8192,54,0,323.7218,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4457.87,1289.17,0.0 -256,1552,57344,8192,21,0,402.0203,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3627.03,1042.82,0.0 -256,1600,1280,8192,29,0,18.8692,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1778.26,842.24,0.0 -256,1600,2304,16384,42,0,45.0896,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2679.02,872.8,0.0 -256,1600,2560,8192,46,0,27.4374,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2445.89,919.6,0.0 -256,1600,4608,16384,43,0,67.3316,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3588.09,974.31,0.0 -256,1600,5120,1280,39,0,15.9852,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x384E,1311.93,1294.0,0.0 -256,1600,5120,5120,47,0,30.2039,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2777.33,1112.02,0.0 -256,1600,5120,6400,47,0,34.0818,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3076.65,1111.68,0.0 -256,1600,5120,25600,47,0,106.7869,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3927.73,958.92,0.0 -256,1600,6400,5120,47,0,35.922,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2919.04,1140.25,0.0 -256,1600,7168,8192,50,0,52.2949,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3593.18,1125.37,0.0 -256,1600,8192,1024,45,0,17.4726,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1536.32,1787.25,0.0 -256,1600,8192,2048,54,0,20.5068,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2618.01,1767.29,0.0 -256,1600,8192,3584,54,0,29.2052,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3216.98,1498.42,0.0 -256,1600,8192,7168,52,0,51.3594,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3658.63,1193.72,0.0 -256,1600,8192,8192,52,0,57.7062,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3721.41,1149.31,0.0 -256,1600,8192,28672,54,0,161.0625,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4666.63,1034.33,0.0 -256,1600,9216,16384,48,0,105.4034,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4584.14,1120.42,0.0 -256,1600,10240,8192,47,0,78.9556,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3399.83,1029.24,0.0 -256,1600,12800,5120,47,0,63.5804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3298.43,1224.03,0.0 -256,1600,13312,16384,50,0,168.8251,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4134.05,975.91,0.0 -256,1600,14336,8192,50,0,98.1944,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3827.2,1131.93,0.0 -256,1600,16384,2048,54,0,37.0677,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2896.7,1911.22,0.0 -256,1600,16384,4096,54,0,62.2458,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3450.01,1433.99,0.0 -256,1600,16384,6656,54,0,91.2148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3825.76,1230.94,0.0 -256,1600,16384,8192,54,0,102.7525,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4179.92,1227.14,0.0 -256,1600,16384,13312,54,0,155.5977,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4485.49,1106.25,0.0 -256,1600,16384,26624,54,0,292.3706,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4774.3,998.16,0.0 -256,1600,26624,16384,48,0,290.0493,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4812.51,1090.88,0.0 -256,1600,51200,5120,54,0,209.092,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4011.92,1430.03,0.0 -256,1600,53248,16384,54,0,576.4818,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4842.7,1074.98,0.0 -256,1600,57344,8192,48,0,336.8595,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4462.51,1261.46,0.0 -256,1664,57344,8192,21,0,410.6409,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3807.14,1053.32,0.0 -256,1792,5376,4096,47,0,27.5359,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2866.08,1232.85,0.0 -256,1792,57344,8192,26,0,432.9381,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3888.84,1034.19,0.0 -256,2048,512,4096,21,0,10.1832,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,843.54,720.8,0.0 -256,2048,512,7168,0,0,26.4625,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,568.06,425.97,0.0 -256,2048,1024,3072,29,0,10.2991,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1251.07,865.41,0.0 -256,2048,1280,8192,37,0,21.633,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1985.38,872.48,0.0 -256,2048,1536,7168,42,0,22.528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2001.83,849.45,0.0 -256,2048,2048,6144,42,0,20.8673,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2469.87,1004.99,0.0 -256,2048,2304,16384,46,0,51.7314,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2988.88,871.59,0.0 -256,2048,2560,8192,38,0,31.454,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2730.95,933.43,0.0 -256,2048,3072,1536,9,0,16.228,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1190.99,1017.69,0.0 -256,2048,3072,6144,51,0,28.7334,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2690.58,985.32,0.0 -256,2048,4096,4096,43,0,22.0307,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3119.26,1332.69,0.0 -256,2048,4096,8192,43,0,38.7963,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3542.58,1081.11,0.0 -256,2048,4096,14336,53,0,64.4578,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,3731.41,943.52,0.0 -256,2048,4608,7168,10,0,66.6546,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2029.74,641.06,0.0 -256,2048,4608,16384,47,0,77.7096,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3979.4,944.55,0.0 -256,2048,5120,1280,10,0,17.0408,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1575.25,1499.87,0.0 -256,2048,5120,5120,50,0,34.7884,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3086.49,1130.31,0.0 -256,2048,5120,6400,50,0,39.2433,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3420.14,1118.89,0.0 -256,2048,5120,25600,50,0,127.207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4220.45,886.13,0.0 -256,2048,6144,3072,54,0,26.6042,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2905.91,1418.9,0.0 -256,2048,6144,4096,44,0,31.413,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3281.42,1335.21,0.0 -256,2048,6144,4608,44,0,34.4644,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3364.75,1277.85,0.0 -256,2048,6144,12288,44,0,76.3642,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4049.51,988.65,0.0 -256,2048,6144,16384,44,0,95.5177,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4316.65,966.05,0.0 -256,2048,6400,5120,54,0,40.6593,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3301.03,1176.64,0.0 -256,2048,7168,2048,9,0,40.0639,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1500.84,968.39,0.0 -256,2048,7168,2304,9,0,42.4075,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1595.14,942.69,0.0 -256,2048,7168,8192,54,0,57.2065,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4204.39,1173.1,0.0 -256,2048,7168,16384,54,0,105.1191,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4576.11,997.51,0.0 -256,2048,7168,18432,54,0,118.8045,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4555.1,962.04,0.0 -256,2048,8192,1024,54,0,18.164,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1891.64,2135.95,0.0 -256,2048,8192,2048,54,0,22.5974,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3041.03,1948.91,0.0 -256,2048,8192,3584,54,0,32.4536,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3705.57,1599.35,0.0 -256,2048,8192,4096,54,0,37.1442,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3700.15,1467.95,0.0 -256,2048,8192,7168,54,0,55.2862,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4350.42,1270.74,0.0 -256,2048,8192,8192,54,0,60.7364,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4525.75,1243.04,0.0 -256,2048,8192,28672,54,0,181.0399,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5314.15,996.22,0.0 -256,2048,9216,16384,47,0,144.1381,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4290.85,902.08,0.0 -256,2048,10240,8192,50,0,93.3564,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3680.49,988.41,0.0 -256,2048,12288,512,34,0,22.2856,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x768E,1156.34,2423.16,0.0 -256,2048,12288,1536,54,0,35.0292,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2207.0,1751.16,0.0 -256,2048,12288,4096,44,0,60.5778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3403.2,1315.53,0.0 -256,2048,12288,6144,44,0,83.0992,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3721.31,1135.65,0.0 -256,2048,12800,5120,54,0,77.0353,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3484.58,1174.0,0.0 -256,2048,13312,16384,54,0,181.8868,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4911.59,991.58,0.0 -256,2048,14336,8192,54,0,111.0807,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4330.51,1132.77,0.0 -256,2048,16384,512,54,0,23.7227,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1448.39,3027.79,0.0 -256,2048,16384,2048,54,0,42.9001,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3203.7,2004.27,0.0 -256,2048,16384,4096,54,0,65.2327,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4213.81,1607.44,0.0 -256,2048,16384,6656,54,0,94.9655,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4703.57,1352.6,0.0 -256,2048,16384,8192,54,0,118.9261,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4622.67,1199.12,0.0 -256,2048,16384,13312,54,0,169.6998,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5264.31,1118.4,0.0 -256,2048,16384,16384,54,0,216.0855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5088.32,1009.34,0.0 -256,2048,16384,26624,54,0,319.7683,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5587.5,977.19,0.0 -256,2048,16384,53248,54,0,619.7511,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5765.88,900.11,0.0 -256,2048,18432,7168,48,0,134.1723,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4033.37,1109.75,0.0 -256,2048,20480,16384,54,0,281.7409,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4878.2,952.77,0.0 -256,2048,24576,1536,54,0,56.4026,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2741.34,2147.25,0.0 -256,2048,26624,16384,54,0,372.6865,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4794.13,922.85,0.0 -256,2048,28672,4096,54,0,123.3527,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3899.68,1462.11,0.0 -256,2048,32768,512,36,0,48.8553,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1406.59,2929.68,0.0 -256,2048,51200,5120,54,0,256.484,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4186.39,1349.13,0.0 -256,2048,53248,16384,54,0,682.467,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5236.02,983.33,0.0 -256,2048,57344,8192,54,0,393.2207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4893.3,1215.99,0.0 -256,2048,59136,8192,54,0,423.1209,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4689.62,1164.75,0.0 -256,2048,106496,16384,54,0,1308.8308,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5460.47,1012.66,0.0 -256,2432,3584,738560,47,0,2828.8778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4551.28,791.49,0.0 -256,2880,1280,8192,42,0,25.7621,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2344.45,947.6,0.0 -256,2880,2304,16384,51,0,63.6883,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3414.01,875.17,0.0 -256,2880,2560,8192,43,0,36.4457,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3314.41,1015.97,0.0 -256,2880,4608,16384,52,0,96.5076,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4506.02,910.64,0.0 -256,2880,5120,1280,54,0,17.8654,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2112.95,1937.33,0.0 -256,2880,5120,5120,48,0,42.4121,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3560.19,1178.23,0.0 -256,2880,5120,6400,54,0,48.4765,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3893.51,1136.45,0.0 -256,2880,5120,25600,54,0,153.855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4907.05,857.24,0.0 -256,2880,6400,5120,54,0,65.797,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2868.58,921.33,0.0 -256,2880,7168,8192,47,0,87.9683,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3844.89,937.2,0.0 -256,2880,8192,1024,40,0,27.0418,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x512E,1786.8,1954.56,0.0 -256,2880,8192,2048,50,0,38.258,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2525.92,1529.71,0.0 -256,2880,8192,3584,50,0,56.3354,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3001.92,1189.78,0.0 -256,2880,8192,7168,50,0,87.1289,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3881.93,997.01,0.0 -256,2880,8192,8192,50,0,96.8561,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3990.94,955.41,0.0 -256,2880,8192,28672,50,0,274.6138,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4926.61,749.83,0.0 -256,2880,9216,16384,52,0,185.5243,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4687.96,820.24,0.0 -256,2880,10240,8192,54,0,117.3484,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4117.52,960.57,0.0 -256,2880,12800,5120,54,0,106.5865,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3541.61,1068.32,0.0 -256,2880,13312,16384,52,0,269.9373,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4653.96,775.45,0.0 -256,2880,14336,8192,54,0,155.6029,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4347.33,983.86,0.0 -256,2880,16384,2048,54,0,61.497,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3142.81,1855.35,0.0 -256,2880,16384,4096,54,0,95.895,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4030.94,1395.53,0.0 -256,2880,16384,6656,54,0,138.9253,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4521.42,1140.77,0.0 -256,2880,16384,8192,54,0,165.6418,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4667.26,1046.1,0.0 -256,2880,16384,13312,54,0,247.3824,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5078.28,899.79,0.0 -256,2880,16384,26624,54,0,473.6679,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5304.47,740.63,0.0 -256,2880,26624,16384,48,0,523.2043,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4802.25,755.06,0.0 -256,2880,51200,5120,54,0,344.4229,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4384.0,1258.21,0.0 -256,2880,53248,16384,54,0,961.9976,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5223.62,796.79,0.0 -256,2880,57344,8192,54,0,567.131,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4771.08,1017.36,0.0 -256,2944,1792,8192,38,0,32.608,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2650.77,918.48,0.0 -256,3000,512,7168,29,0,16.2854,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1352.14,961.54,0.0 -256,3000,2112,7168,51,0,33.1164,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2742.84,935.89,0.0 -256,3000,3072,1536,47,0,15.3407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,1845.52,1505.49,0.0 -256,3000,7168,256,40,0,17.4695,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x512E,630.24,2536.39,0.0 -256,3000,7168,2048,50,0,37.8602,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2326.46,1410.98,0.0 -256,3072,57344,8192,21,0,735.2449,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3925.52,815.76,0.0 -256,3200,1280,8192,42,0,26.6539,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2517.79,995.8,0.0 -256,3200,2304,16384,43,0,69.1244,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3495.03,865.6,0.0 -256,3200,2560,8192,43,0,39.9461,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3359.97,1000.77,0.0 -256,3200,4608,16384,54,0,106.0754,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4555.1,881.02,0.0 -256,3200,5120,1280,45,0,19.7857,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2119.87,1925.27,0.0 -256,3200,5120,5120,45,0,47.9925,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3495.8,1126.58,0.0 -256,3200,5120,6400,45,0,54.7726,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3828.83,1084.34,0.0 -256,3200,5120,25600,45,0,176.123,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4762.93,790.72,0.0 -256,3200,6400,5120,47,0,69.1363,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3033.36,947.92,0.0 -256,3200,7168,8192,50,0,97.6826,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3847.25,904.38,0.0 -256,3200,8192,1024,54,0,28.482,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1884.95,2045.56,0.0 -256,3200,8192,2048,54,0,40.9605,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2621.41,1564.78,0.0 -256,3200,8192,3584,54,0,57.3302,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3277.59,1270.59,0.0 -256,3200,8192,7168,54,0,94.5081,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3976.48,986.77,0.0 -256,3200,8192,8192,54,0,104.4674,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4111.3,948.53,0.0 -256,3200,8192,28672,52,0,305.1042,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4926.97,707.12,0.0 -256,3200,9216,16384,54,0,197.1671,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4901.26,815.02,0.0 -256,3200,10240,8192,45,0,122.2919,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4390.08,986.05,0.0 -256,3200,12800,5120,54,0,109.5218,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3829.65,1121.97,0.0 -256,3200,13312,16384,54,0,287.2469,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4859.46,767.5,0.0 -256,3200,14336,8192,54,0,162.3098,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4630.77,1007.81,0.0 -256,3200,16384,2048,54,0,71.5292,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3002.25,1746.3,0.0 -256,3200,16384,4096,54,0,111.6874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3845.53,1297.96,0.0 -256,3200,16384,6656,54,0,163.4773,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4269.29,1040.1,0.0 -256,3200,16384,8192,47,0,196.8084,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4364.62,940.37,0.0 -256,3200,16384,13312,47,0,292.815,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4767.05,803.27,0.0 -256,3200,16384,26624,47,0,576.3884,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4843.49,634.22,0.0 -256,3200,26624,16384,54,0,556.8581,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5013.36,744.74,0.0 -256,3200,51200,5120,54,0,386.9368,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4335.91,1206.77,0.0 -256,3200,53248,16384,48,0,1128.8789,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4946.02,711.51,0.0 -256,3200,57344,8192,54,0,649.529,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4628.7,946.82,0.0 -256,3712,951552,4352,54,0,6920.3238,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4442.56,1321.18,0.0 -256,4096,512,4096,29,0,10.9658,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1566.68,1243.09,0.0 -256,4096,512,7168,0,0,27.3037,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1101.12,758.48,0.0 -256,4096,800,5120,21,0,39.6258,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,846.78,481.69,0.0 -256,4096,1024,3072,30,0,14.0452,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,1834.78,1157.19,0.0 -256,4096,1280,8192,38,0,32.2517,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2663.41,1007.88,0.0 -256,4096,1536,7168,51,0,33.603,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2684.11,975.15,0.0 -256,4096,2048,6144,43,0,33.101,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3114.08,1077.05,0.0 -256,4096,2304,16384,47,0,78.0812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3960.46,913.19,0.0 -256,4096,2560,8192,50,0,49.3258,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3482.94,977.88,0.0 -256,4096,3072,1536,10,0,28.6765,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1347.96,1069.55,0.0 -256,4096,3072,6144,44,0,44.4912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3475.27,1060.57,0.0 -256,4096,4096,4096,54,0,37.9223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3624.23,1327.23,0.0 -256,4096,4096,8192,54,0,63.8634,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4304.15,1050.82,0.0 -256,4096,4096,14336,54,0,104.59,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4599.26,882.25,0.0 -256,4096,4608,7168,10,0,131.1566,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2063.05,525.66,0.0 -256,4096,4608,16384,47,0,144.7034,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4274.09,753.62,0.0 -256,4096,5120,1280,33,0,30.2367,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x640E,1775.56,1582.23,0.0 -256,4096,5120,5120,50,0,67.9041,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3162.52,965.13,0.0 -256,4096,5120,6400,50,0,77.6951,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3454.99,919.42,0.0 -256,4096,5120,25600,50,0,238.1354,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4508.96,671.5,0.0 -256,4096,6144,3072,44,0,52.4445,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,2948.24,1259.62,0.0 -256,4096,6144,4096,44,0,61.9246,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3329.18,1151.45,0.0 -256,4096,6144,12288,44,0,144.5997,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4277.15,783.17,0.0 -256,4096,6144,16384,44,0,177.7384,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4639.59,755.14,0.0 -256,4096,6400,5120,54,0,81.4407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3296.08,973.7,0.0 -256,4096,7168,2048,10,0,77.5343,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1551.04,906.11,0.0 -256,4096,7168,2304,5,0,80.8616,a4w4_blockscale_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1673.12,886.66,0.0 -256,4096,7168,8192,54,0,110.3777,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4358.09,949.99,0.0 -256,4096,7168,16384,54,0,197.1971,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4878.74,765.71,0.0 -256,4096,7168,18432,54,0,218.7404,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4948.02,743.02,0.0 -256,4096,8192,1024,54,0,30.4329,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2258.07,2411.87,0.0 -256,4096,8192,2048,54,0,43.3485,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3170.56,1838.4,0.0 -256,4096,8192,3584,54,0,62.8575,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3826.4,1417.95,0.0 -256,4096,8192,4096,54,0,68.705,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4000.84,1343.06,0.0 -256,4096,8192,7168,54,0,104.1747,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4617.59,1066.95,0.0 -256,4096,8192,8192,54,0,116.6368,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4713.4,1006.89,0.0 -256,4096,8192,28672,54,0,345.2111,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5573.82,704.7,0.0 -256,4096,9216,16384,44,0,253.5015,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4879.46,728.0,0.0 -256,4096,10240,8192,54,0,156.2228,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4398.81,912.84,0.0 -256,4096,12288,512,54,0,34.8007,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1480.99,3013.09,0.0 -256,4096,12288,1536,54,0,57.2009,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2703.08,1979.8,0.0 -256,4096,12288,4096,54,0,101.7739,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4051.3,1318.78,0.0 -256,4096,12288,6144,54,0,138.7122,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4458.69,1088.55,0.0 -256,4096,12800,5120,54,0,139.4543,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3849.8,1062.08,0.0 -256,4096,13312,16384,54,0,367.441,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4862.57,684.89,0.0 -256,4096,14336,8192,54,0,207.3872,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4639.02,930.33,0.0 -256,4096,16384,512,54,0,45.8036,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1500.31,3044.75,0.0 -256,4096,16384,2048,54,0,82.564,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3329.27,1879.62,0.0 -256,4096,16384,4096,45,0,129.4044,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4248.35,1361.32,0.0 -256,4096,16384,6656,54,0,186.3419,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4794.16,1086.04,0.0 -256,4096,16384,8192,54,0,215.1571,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5110.27,1013.7,0.0 -256,4096,16384,13312,54,0,335.7525,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5321.5,805.75,0.0 -256,4096,16384,16384,54,0,394.8506,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5569.25,764.82,0.0 -256,4096,16384,26624,54,0,626.5094,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5703.69,649.39,0.0 -256,4096,16384,53248,54,0,1240.1583,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5762.83,547.9,0.0 -256,4096,18432,7168,54,0,235.4389,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4597.08,984.27,0.0 -256,4096,20480,16384,54,0,506.1778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5430.46,729.19,0.0 -256,4096,24576,1536,45,0,106.1324,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2913.7,2104.42,0.0 -256,4096,26624,16384,54,0,656.538,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5442.81,715.51,0.0 -256,4096,28672,4096,54,0,220.5013,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4363.12,1369.56,0.0 -256,4096,32768,512,36,0,90.4435,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1519.61,3072.33,0.0 -256,4096,51200,5120,54,0,478.3978,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4488.91,1172.64,0.0 -256,4096,53248,16384,54,0,1300.5576,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5495.2,696.6,0.0 -256,4096,57344,8192,54,0,744.0942,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5171.78,969.53,0.0 -256,4096,59136,8192,54,0,791.8357,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5011.83,938.88,0.0 -256,4096,106496,16384,54,0,2568.3523,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5565.3,692.42,0.0 -256,4224,4096,768,38,0,19.9861,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1329.68,1891.21,0.0 -256,4736,44416,17920,54,0,1389.8884,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5424.24,619.55,0.0029 -256,4992,4864,7168,50,0,96.904,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3592.15,865.66,0.0 -256,5120,14336,8192,54,0,256.5179,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4688.14,882.95,0.0 -256,5504,6144,5376,50,0,102.785,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3537.44,962.62,0.0 -256,6016,4864,7168,54,0,106.812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3927.44,912.98,0.0 -256,6144,14336,8192,54,0,308.834,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4672.77,842.03,0.0 -256,6272,2688,68096,47,0,535.6359,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4286.64,632.5,0.0 -256,6528,7040,5632,54,0,128.2402,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4036.65,1014.67,0.0088 -256,6656,2560,3328,47,0,44.3137,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2559.34,1115.1,0.0 -256,7040,896,1280,43,0,11.9928,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,1346.48,1475.45,0.0 -256,7168,6272,5888,54,0,139.2251,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3802.63,930.02,0.0019 -256,7680,14336,8192,54,0,371.2804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4858.56,835.97,0.0 -256,7808,5120,7168,54,0,141.1399,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4060.58,894.77,0.0 -256,7808,7040,5376,54,0,150.5516,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3925.69,995.33,0.0112 -256,8192,512,4096,42,0,16.4152,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2093.17,1596.96,0.0 -256,8192,512,7168,1,0,37.1971,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1616.51,1064.16,0.0 -256,8192,800,5120,21,0,41.7607,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1606.99,865.09,0.0 -256,8192,1024,3072,43,0,20.267,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2543.03,1526.27,0.0 -256,8192,1280,8192,50,0,51.423,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3340.89,1162.3,0.0 -256,8192,1536,7168,44,0,48.906,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3688.48,1227.48,0.0 -256,8192,2048,6144,54,0,55.3677,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3723.44,1174.18,0.0 -256,8192,2304,16384,47,0,151.5138,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4081.97,816.64,0.0 -256,8192,2560,8192,50,0,96.5366,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3559.24,890.68,0.0 -256,8192,3072,1536,10,0,50.2556,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1538.32,1173.65,0.0 -256,8192,3072,6144,44,0,86.0465,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3593.84,987.08,0.0 -256,8192,4096,4096,54,0,70.1619,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3917.77,1315.17,0.0 -256,8192,4096,8192,54,0,120.7533,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4552.72,972.57,0.0 -256,8192,4096,14336,54,0,189.8393,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5067.83,817.48,0.0 -256,8192,4608,7168,10,0,227.7773,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2375.86,532.86,0.0 -256,8192,4608,16384,44,0,262.9408,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4704.29,685.92,0.0 -256,8192,5120,1280,45,0,49.1678,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2183.83,1879.4,0.0 -256,8192,5120,5120,52,0,116.327,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3692.15,1014.08,0.0 -256,8192,5120,6400,54,0,132.8247,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4041.95,952.27,0.0 -256,8192,5120,25600,52,0,429.3911,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,5001.23,592.19,0.0 -256,8192,6144,3072,54,0,87.9918,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3514.39,1394.26,0.0 -256,8192,6144,4096,45,0,106.5776,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3868.7,1219.99,0.0 -256,8192,6144,12288,54,0,240.4063,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5145.25,785.1,0.0 -256,8192,6144,16384,54,0,308.4804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5346.43,707.03,0.0 -256,8192,6400,5120,54,0,148.146,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3623.93,959.95,0.0 -256,8192,7168,2048,10,0,135.959,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1769.05,979.48,0.0 -256,8192,7168,2304,10,0,145.1712,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1863.89,930.87,0.0 -256,8192,7168,8192,54,0,210.4687,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4571.1,856.92,0.0 -256,8192,7168,16384,54,0,376.4758,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5110.94,646.18,0.0 -256,8192,7168,18432,54,0,419.0895,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5165.16,618.0,0.0 -256,8192,8192,1024,54,0,55.9419,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2456.82,2549.19,0.0 -256,8192,8192,2048,54,0,84.2984,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3260.77,1791.2,0.0 -256,8192,8192,3584,54,0,119.4879,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4025.82,1368.99,0.0 -256,8192,8192,4096,54,0,129.4913,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4245.5,1295.62,0.0 -256,8192,8192,7168,54,0,197.8445,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4862.77,975.2,0.0 -256,8192,8192,8192,54,0,216.0155,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5089.97,932.0,0.0 -256,8192,8192,28672,54,0,668.518,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5756.45,552.11,0.0 -256,8192,9216,16384,54,0,475.0672,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5207.48,618.02,0.0 -256,8192,10240,8192,45,0,277.173,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4958.6,877.68,0.0 -256,8192,12288,512,45,0,66.5444,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1549.03,3104.24,0.0 -256,8192,12288,1536,45,0,107.0387,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2889.03,2027.82,0.0 -256,8192,12288,4096,45,0,190.4456,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4330.02,1277.37,0.0 -256,8192,12288,6144,45,0,258.7637,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4780.23,1021.17,0.0 -256,8192,12800,5120,54,0,257.7219,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4166.28,1022.24,0.0 -256,8192,13312,16384,54,0,671.4716,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5321.76,587.16,0.0 -256,8192,14336,8192,54,0,387.4984,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4965.56,844.28,0.0 -256,8192,16384,512,54,0,86.6263,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1586.57,3171.4,0.0 -256,8192,16384,2048,45,0,155.4501,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3536.54,1888.72,0.0 -256,8192,16384,4096,54,0,242.0538,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4542.43,1316.93,0.0 -256,8192,16384,6656,54,0,366.2828,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4877.94,956.16,0.0 -256,8192,16384,8192,54,0,425.1217,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5172.69,868.22,0.0 -256,8192,16384,13312,54,0,657.0434,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5438.63,657.51,0.0 -256,8192,16384,16384,54,0,786.0032,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5595.46,597.66,0.0 -256,8192,16384,26624,54,0,1246.495,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5733.54,477.81,0.0 -256,8192,16384,53248,54,0,2463.2448,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5802.77,374.61,0.0 -256,8192,18432,7168,54,0,435.3143,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4972.65,912.93,0.0 -256,8192,20480,16384,54,0,982.1553,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5597.44,580.79,0.0 -256,8192,24576,1536,45,0,208.6912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2963.59,2050.01,0.0 -256,8192,26624,16384,54,0,1399.3116,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5107.39,515.55,0.0 -256,8192,28672,4096,54,0,433.7855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4435.71,1256.98,0.0 -256,8192,32768,512,36,0,172.7725,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1590.98,3168.08,0.0 -256,8192,51200,5120,54,0,926.9073,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4633.65,1069.04,0.0 -256,8192,53248,16384,54,0,2561.6588,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5579.84,537.05,0.0 -256,8192,57344,8192,54,0,1433.2252,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5370.11,842.83,0.0 -256,8192,59136,8192,54,0,1525.5281,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5202.85,815.89,0.0 -256,8192,106496,16384,54,0,5127.9293,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5574.82,523.48,0.0 -256,9984,15360,13824,54,0,814.223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5207.34,591.84,0.0 -256,12416,8960,15360,54,0,651.6207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5244.64,593.39,0.0 -256,12416,11136,12544,54,0,686.4753,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5053.03,618.01,0.0115 -256,12800,9344,12800,54,0,611.4549,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5007.47,622.99,0.0137 -256,13056,43392,1792,54,0,607.1164,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3344.38,1949.59,0.0029 -256,13568,13312,10240,54,0,733.9988,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5039.57,679.65,0.0 -256,14720,13568,8704,54,0,727.3392,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4780.08,718.44,0.0 -256,16384,512,4096,43,0,24.3647,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2820.45,2108.8,0.0 -256,16384,512,7168,9,0,60.4461,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1989.53,1279.36,0.0 -256,16384,800,5120,21,0,48.5545,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2764.27,1445.91,0.0 -256,16384,1024,3072,45,0,33.1376,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3110.64,1819.48,0.0 -256,16384,1280,8192,50,0,98.537,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3486.99,1159.92,0.0 -256,16384,1536,7168,44,0,94.2122,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3829.41,1215.95,0.0 -256,16384,2048,6144,45,0,102.2609,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4032.01,1209.96,0.0 -256,16384,2304,16384,44,0,264.3801,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4678.68,864.62,0.0 -256,16384,2560,8192,45,0,164.9847,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4165.2,978.76,0.0 -256,16384,3072,1536,10,0,97.6538,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1583.34,1183.83,0.0 -256,16384,3072,6144,54,0,141.9524,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4356.92,1130.18,0.0 -256,16384,4096,4096,54,0,132.0097,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4164.51,1334.45,0.0 -256,16384,4096,8192,54,0,226.9404,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4844.94,961.06,0.0 -256,16384,4096,14336,54,0,363.6314,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5291.47,772.81,0.0 -256,16384,4608,7168,10,0,454.2569,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2382.64,498.02,0.0 -256,16384,4608,16384,54,0,486.8838,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5081.09,663.32,0.0 -256,16384,5120,1280,45,0,84.5625,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2539.52,2146.75,0.0 -256,16384,5120,5120,45,0,200.0472,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4293.95,1113.85,0.0 -256,16384,5120,6400,54,0,234.6066,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4576.78,1008.43,0.0 -256,16384,5120,25600,54,0,815.9065,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5264.04,542.98,0.0 -256,16384,6144,3072,45,0,161.8127,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3822.17,1458.04,0.0 -256,16384,6144,4096,54,0,193.6462,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4258.46,1277.92,0.0 -256,16384,6144,12288,54,0,467.7627,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5288.8,726.31,0.0 -256,16384,6144,16384,54,0,600.1017,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5496.63,643.02,0.0 -256,16384,6400,5120,45,0,268.7424,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3995.43,997.39,0.012499047 -256,16384,7168,2048,10,0,269.441,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1785.31,961.24,0.0 -256,16384,7168,2304,10,0,288.9433,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1872.91,906.8,0.0 -256,16384,7168,8192,54,0,383.7231,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5014.41,863.51,0.0 -256,16384,7168,16384,54,0,703.6728,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5468.86,607.98,0.0 -256,16384,7168,18432,54,0,791.4978,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5469.79,570.99,0.0 -256,16384,8192,1024,54,0,107.806,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2549.75,2606.7,0.0 -256,16384,8192,2048,54,0,159.5154,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3446.41,1840.58,0.0 -256,16384,8192,3584,54,0,233.3336,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4123.16,1339.18,0.0 -256,16384,8192,4096,54,0,254.4673,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4320.84,1252.68,0.0 -256,16384,8192,7168,54,0,398.3153,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4830.71,895.06,0.0 -256,16384,8192,8192,54,0,429.289,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5122.48,859.79,0.0 -256,16384,8192,28672,54,0,1325.5902,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5806.15,468.29,0.0 -256,16384,9216,16384,54,0,901.7683,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5486.78,567.45,0.0 -256,16384,10240,8192,54,0,544.3326,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5049.82,816.77,0.0 -256,16384,12288,512,45,0,125.1737,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1646.98,3275.39,0.0 -256,16384,12288,1536,45,0,206.8407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2990.1,2053.14,0.0 -256,16384,12288,4096,45,0,370.0402,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4457.0,1246.82,0.0 -256,16384,12288,6144,54,0,509.7273,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4853.38,962.74,0.0 -256,16384,12800,5120,54,0,486.651,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4412.78,1015.39,0.0 -256,16384,13312,16384,54,0,1289.7301,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5541.33,526.84,0.0 -256,16384,14336,8192,54,0,757.6926,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5078.96,786.06,0.0 -256,16384,16384,512,54,0,156.2988,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1758.67,3488.57,0.0 -256,16384,16384,2048,54,0,303.603,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3621.54,1878.85,0.0 -256,16384,16384,4096,54,0,478.5568,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4595.11,1262.09,0.0 -256,16384,16384,6656,54,0,721.8789,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4950.16,894.78,0.0 -256,16384,16384,8192,54,0,832.934,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5280.19,805.69,0.0 -256,16384,16384,13312,54,0,1296.2628,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5513.41,582.42,0.0 -256,16384,16384,16384,54,0,1563.1208,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5627.26,515.19,0.0 -256,16384,16384,26624,54,0,2485.1392,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5751.65,391.56,0.0 -256,16384,16384,53248,54,0,5103.8959,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5601.07,276.12,0.0 -256,16384,18432,7168,54,0,862.0836,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5021.93,845.35,0.0 -256,16384,18432,16384,10,0,4031.1796,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2454.77,220.58,0.0 -256,16384,20480,16384,45,0,1960.3005,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5608.89,496.39,0.0 -256,16384,24576,1536,45,0,399.9546,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3092.73,2092.15,0.0 -256,16384,26624,16384,54,0,2587.5139,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5524.09,473.33,0.0 -256,16384,28672,4096,54,0,861.417,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4467.4,1197.79,0.0 -256,16384,32768,512,36,0,345.2436,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1592.37,3146.55,0.0 -256,16384,51200,5120,54,0,1983.796,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4330.05,932.93,0.0 -256,16384,53248,16384,54,0,5172.8223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5526.44,447.58,0.0 -256,16384,57344,8192,54,0,2953.8423,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5211.23,738.37,0.0 -256,16384,59136,8192,54,0,3103.1713,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5115.48,724.13,0.0 -256,16384,106496,16384,54,0,10335.4736,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5531.88,435.04,0.0 -256,16896,31104,7168,54,0,1565.4111,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4812.82,781.33,0.0041 -256,20480,512,7168,7,0,72.5979,a4w4_blockscale_256x96x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2070.64,1325.2,0.0 -256,20480,1536,7168,10,0,199.649,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2258.82,710.35,0.0 -256,20480,3072,1536,10,0,118.3924,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1632.48,1215.59,0.0 -256,20480,4608,7168,10,0,572.5215,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2363.08,486.72,0.0 -256,20480,7168,2048,10,0,333.465,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1803.17,965.36,0.0 -256,20480,7168,2304,10,0,355.0349,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1905.33,916.68,0.0 -256,22784,1664,61696,52,0,1024.4489,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4566.46,810.19,0.0226 -256,24192,32384,2304,45,0,1001.4336,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3604.89,1629.71,0.0043 -256,32768,1024,3072,45,0,61.3382,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3361.01,1940.28,0.0 -256,32768,1280,8192,54,0,167.5074,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4102.47,1333.35,0.0 -256,32768,1536,7168,45,0,162.1303,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4450.46,1379.19,0.0 -256,32768,2048,6144,45,0,190.7134,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4323.94,1264.58,0.0 -256,32768,2112,7168,48,0,293.0219,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3385.88,898.98,0.0 -256,32768,2304,16384,48,0,505.9259,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4889.85,866.34,0.0 -256,32768,2560,8192,54,0,293.3453,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4685.23,1065.21,0.0 -256,32768,3072,6144,45,0,272.8936,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4532.72,1141.2,0.0 -256,32768,4096,4096,54,0,255.6975,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4300.05,1345.08,0.0 -256,32768,4096,8192,54,0,444.8527,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4943.26,942.85,0.0 -256,32768,4096,14336,54,0,721.3297,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5335.0,738.46,0.0 -256,32768,4608,16384,54,0,906.1081,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5460.5,671.19,0.0 -256,32768,5120,1280,45,0,163.3916,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2628.63,2202.03,0.0 -256,32768,5120,5120,45,0,392.6814,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4375.01,1101.5,0.0 -256,32768,5120,6400,54,0,457.7773,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4691.11,997.83,0.0 -256,32768,5120,25600,54,0,1545.9348,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5556.47,530.75,0.0 -256,32768,6144,3072,54,0,315.1874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3924.49,1467.13,0.0 -256,32768,6144,4096,54,0,377.8643,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4364.71,1276.5,0.0 -256,32768,6144,12288,54,0,933.4489,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5300.56,687.48,0.0 -256,32768,6144,16384,54,0,1198.9958,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5502.16,601.69,0.0 -256,32768,6400,5120,45,0,518.2698,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4143.56,1002.76,0.016083188 -256,32768,7168,8192,54,0,763.9579,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5037.31,829.02,0.0 -256,32768,7168,16384,54,0,1414.8817,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5439.73,563.24,0.0 -256,32768,7168,18432,54,0,1586.5235,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5457.63,528.08,0.0 -256,32768,8192,1024,54,0,211.6275,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2597.75,2635.96,0.0 -256,32768,8192,2048,45,0,315.6601,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3483.21,1833.66,0.0 -256,32768,8192,3584,54,0,464.7093,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4140.54,1313.23,0.0 -256,32768,8192,4096,54,0,483.3855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4549.21,1284.19,0.0 -256,32768,8192,7168,54,0,782.3144,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4919.11,873.91,0.0 -256,32768,8192,8192,54,0,843.7771,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5212.33,835.11,0.0 -256,32768,8192,28672,54,0,2654.9427,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5797.93,423.39,0.0 -256,32768,9216,16384,54,0,1780.1369,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5558.9,532.49,0.0 -256,32768,10240,8192,45,0,1088.4791,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5050.68,778.38,0.0 -256,32768,12288,512,45,0,245.9473,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1676.44,3321.2,0.0 -256,32768,12288,1536,45,0,400.2607,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3090.36,2098.41,0.0 -256,32768,12288,4096,45,0,732.0118,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4506.12,1226.18,0.0 -256,32768,12288,6144,45,0,1020.7607,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4847.17,924.52,0.0 -256,32768,12800,5120,54,0,997.4188,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4306.08,957.99,0.0 -256,32768,13312,16384,54,0,2590.6993,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5517.29,482.46,0.0 -256,32768,14336,8192,54,0,1537.4579,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5006.04,736.58,0.0 -256,32768,16384,512,54,0,319.1351,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1722.64,3403.97,0.0 -256,32768,16384,2048,45,0,593.0636,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3707.9,1895.37,0.0 -256,32768,16384,4096,54,0,958.7045,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4587.49,1224.99,0.0 -256,32768,16384,6656,54,0,1434.6488,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4981.59,862.45,0.0 -256,32768,16384,8192,54,0,1680.4113,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5234.49,758.78,0.0 -256,32768,16384,13312,54,0,2618.0472,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5459.66,535.09,0.0 -256,32768,16384,16384,45,0,3077.3314,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5716.7,479.76,0.0 -256,32768,16384,26624,54,0,4978.7035,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5741.92,347.09,0.0 -256,32768,16384,53248,54,0,9949.223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5746.64,239.45,0.0 -256,32768,18432,7168,45,0,1748.9945,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4950.65,795.58,0.0 -256,32768,20480,16384,45,0,3960.5471,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5552.32,449.03,0.0 -256,32768,24576,1536,45,0,800.9843,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3088.58,2065.77,0.0 -256,32768,26624,16384,54,0,5494.2956,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5203.09,406.12,0.0 -256,32768,28672,4096,45,0,1707.3148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4508.0,1174.29,0.0 -256,32768,32768,512,36,0,666.2533,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1650.29,3248.41,0.0 -256,32768,51200,5120,54,0,4066.2238,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4225.02,878.06,0.0 -256,32768,53248,16384,54,0,10524.9535,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5432.29,398.51,0.0 -256,32768,57344,8192,54,0,6141.7014,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5012.67,672.0,0.0 -256,32768,59136,8192,54,0,6136.6529,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5173.57,692.88,0.0 -256,32768,106496,16384,54,0,20606.9162,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5549.07,394.05,0.0 -256,33280,7168,10752,54,0,999.5708,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5132.0,694.85,0.0 -256,35200,256,19968,50,0,106.6743,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3373.55,3487.39,0.0 -256,48256,61056,5376,45,0,7047.1599,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4495.26,877.87,0.0052 -256,51712,14976,7680,54,0,2503.0596,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4752.34,721.1,0.0085 -256,51968,61696,4608,54,0,6682.8336,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4421.55,998.73,0.0 -256,56832,44416,1280,52,0,2488.2075,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,2597.08,2055.01,0.0029 -256,60000,4096,512,54,0,165.0566,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1524.68,3077.3,0.0 -256,64896,1280,60672,54,0,2002.6549,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5033.15,1085.38,0.0 -256,65536,2112,7168,48,0,575.6528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3447.0,902.06,0.0 -256,65536,16384,8192,54,0,3353.2726,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5246.27,740.48,0.0 -256,168448,3200,6400,54,0,1573.7469,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4384.21,1034.05,0.0189 -256,540544,7552,1024,54,0,3458.7874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2417.13,2441.61,0.0082 -256,721536,6016,1024,54,0,3558.6675,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2498.09,2544.22,0.0104 -256,838784,5760,3584,45,0,8593.0777,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4030.16,1300.61,0.031 +gfx,cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,512,4096,21,0,5.4049,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,0.78,194.57,0.0 +gfx950,256,1,800,5120,29,0,8.5995,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,0.95,238.64,0.0 +gfx950,256,1,1024,3072,21,0,5.7795,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.09,272.77,0.0 +gfx950,256,1,1024,4096,29,0,6.7429,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1.24,311.62,0.0 +gfx950,256,1,1280,8192,21,0,16.726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.25,313.85,0.0 +gfx950,256,1,1536,3072,29,0,6.5636,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1.44,360.15,0.0 +gfx950,256,1,1536,7168,21,0,12.1915,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.81,452.09,0.0 +gfx950,256,1,2048,6144,29,0,10.9087,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.31,577.39,0.0 +gfx950,256,1,2048,7168,21,0,12.2956,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.39,597.59,0.0 +gfx950,256,1,2112,7168,21,0,12.3647,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.45,612.81,0.0 +gfx950,256,1,2304,16384,21,0,22.5794,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.34,836.48,0.0 +gfx950,256,1,2560,8192,21,0,17.5745,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.39,597.17,0.0 +gfx950,256,1,3072,1536,21,0,5.7648,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.64,410.46,0.0 +gfx950,256,1,3072,6144,21,0,10.8323,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.48,872.06,0.0 +gfx950,256,1,4096,1024,29,0,4.9501,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1.69,425.42,0.0 +gfx950,256,1,4096,4096,21,0,8.4223,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.98,997.22,0.0 +gfx950,256,1,4096,8192,21,0,12.3973,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.41,1354.29,0.0 +gfx950,256,1,4096,14336,21,0,20.9628,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.6,1401.32,0.0 +gfx950,256,1,4608,16384,21,0,24.1342,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.26,1564.84,0.0 +gfx950,256,1,5120,1280,29,0,6.2868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.08,522.95,0.0 +gfx950,256,1,5120,5120,21,0,10.3166,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.08,1271.74,0.0 +gfx950,256,1,5120,6400,21,0,13.1399,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.99,1247.91,0.0 +gfx950,256,1,5120,25600,21,0,35.9675,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.29,1822.73,0.0 +gfx950,256,1,6144,2048,29,0,6.2966,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.0,1001.3,0.0 +gfx950,256,1,6144,3072,21,0,7.0757,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.33,1335.7,0.0 +gfx950,256,1,6144,4096,21,0,8.4464,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.96,1491.43,0.0 +gfx950,256,1,6144,12288,21,0,18.4152,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.2,2050.87,0.0 +gfx950,256,1,6144,16384,21,0,24.3831,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.26,2065.04,0.0 +gfx950,256,1,6400,5120,29,0,12.2475,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,5.35,1339.0,0.0 +gfx950,256,1,7168,2048,29,0,6.3797,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.6,1152.94,0.0 +gfx950,256,1,7168,8192,21,0,13.6832,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.58,2147.05,0.0 +gfx950,256,1,7168,16384,21,0,24.8921,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.44,2359.9,0.0 +gfx950,256,1,7168,18432,21,0,27.4078,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.64,2411.13,0.0 +gfx950,256,1,8192,1024,21,0,5.9074,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.84,712.87,0.0 +gfx950,256,1,8192,2048,21,0,6.6356,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,5.06,1266.81,0.0 +gfx950,256,1,8192,3584,21,0,8.4146,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.98,1746.75,0.0 +gfx950,256,1,8192,4096,37,0,8.8326,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,7.6,1901.55,0.0 +gfx950,256,1,8192,7168,42,0,17.189,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,6.83,1709.24,0.0 +gfx950,256,1,8192,8192,21,0,14.1316,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.5,2375.87,0.0 +gfx950,256,1,8192,28672,21,0,43.2883,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.85,2713.69,0.0 +gfx950,256,1,9216,16384,21,0,26.0135,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.61,2903.27,0.0 +gfx950,256,1,10240,8192,21,0,14.6562,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.45,2863.47,0.0 +gfx950,256,1,12288,512,29,0,4.3707,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.88,725.41,0.0 +gfx950,256,1,12288,1536,29,0,5.8524,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.45,1616.86,0.0 +gfx950,256,1,12288,4096,21,0,9.5064,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.59,2650.05,0.0 +gfx950,256,1,12288,6144,21,0,12.76,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.83,2960.53,0.0 +gfx950,256,1,12800,5120,21,0,14.2687,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.19,2298.47,0.0 +gfx950,256,1,13312,16384,21,0,33.2284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,13.13,3282.94,0.0 +gfx950,256,1,14336,8192,37,0,17.338,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,13.55,3388.69,0.0 +gfx950,256,1,16384,512,29,0,4.4281,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,3.79,954.66,0.0 +gfx950,256,1,16384,2048,37,0,6.2861,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,10.68,2674.31,0.0 +gfx950,256,1,16384,4096,21,0,11.8228,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.35,2841.06,0.0 +gfx950,256,1,16384,6656,37,0,15.0912,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,14.45,3615.49,0.0 +gfx950,256,1,16384,8192,29,0,19.807,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,13.55,3390.0,0.0 +gfx950,256,1,16384,13312,21,0,24.7664,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,17.61,4404.81,0.0 +gfx950,256,1,16384,16384,29,0,29.8545,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,17.98,4497.1,0.0 +gfx950,256,1,16384,26624,29,0,47.945,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,18.2,4550.0,0.0 +gfx950,256,1,16384,53248,21,0,85.9036,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,20.31,5078.56,0.0 +gfx950,256,1,18432,7168,21,0,14.9154,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,17.72,4431.71,0.0 +gfx950,256,1,18432,16384,0,0,48.5148,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,12.45,3113.28,0.0 +gfx950,256,1,20480,16384,21,0,34.0333,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.72,4931.09,0.0 +gfx950,256,1,24576,1536,21,0,6.1017,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,12.37,3101.48,0.0 +gfx950,256,1,26624,16384,46,0,44.6996,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,19.52,4880.7,0.0 +gfx950,256,1,28672,4096,21,0,14.0433,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.73,4185.6,0.0 +gfx950,256,1,32768,512,29,0,5.0283,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.67,1681.36,0.0 +gfx950,256,1,51200,5120,14,0,23.3643,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,22.44,5614.42,0.0 +gfx950,256,1,53248,16384,0,0,73.6484,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,23.69,5924.4,0.0 +gfx950,256,1,57344,8192,14,0,44.7024,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,21.02,5256.98,0.0 +gfx950,256,1,59136,8192,14,0,46.5332,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,20.82,5207.97,0.0 +gfx950,256,1,106496,16384,3,0,145.2761,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,24.02,6006.74,0.0 +gfx950,256,2,512,4096,21,0,5.4335,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1.54,194.11,0.0 +gfx950,256,2,1024,3072,21,0,5.8351,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,2.16,270.78,0.0 +gfx950,256,2,1024,4096,29,0,6.8239,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.46,308.53,0.0 +gfx950,256,2,1536,3072,29,0,6.3888,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2.95,370.73,0.0 +gfx950,256,2,1536,7168,21,0,12.3332,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.57,447.44,0.0 +gfx950,256,2,2048,6144,21,0,10.8043,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.66,583.64,0.0 +gfx950,256,2,2048,7168,21,0,12.4844,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.7,589.17,0.0 +gfx950,256,2,3072,1536,21,0,5.8068,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.25,408.68,0.0 +gfx950,256,2,3072,6144,21,0,10.7991,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.99,875.59,0.0 +gfx950,256,2,4096,1024,29,0,5.0164,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,3.34,421.53,0.0 +gfx950,256,2,4096,4096,21,0,9.4735,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.08,887.64,0.0 +gfx950,256,2,4096,8192,21,0,12.6033,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.65,1333.13,0.0 +gfx950,256,2,4096,14336,21,0,21.0437,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.16,1396.66,0.0 +gfx950,256,2,6144,2048,21,0,6.2664,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.03,1008.25,0.0 +gfx950,256,2,6144,3072,21,0,7.4379,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.15,1272.51,0.0 +gfx950,256,2,6144,4096,21,0,9.2767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,10.85,1359.49,0.0 +gfx950,256,2,6144,12288,21,0,18.5205,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.31,2040.2,0.0 +gfx950,256,2,6144,16384,21,0,24.4987,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.44,2056.13,0.0 +gfx950,256,2,7168,2048,21,0,6.2888,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.34,1172.04,0.0 +gfx950,256,2,7168,8192,21,0,13.8337,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.98,2125.03,0.0 +gfx950,256,2,7168,16384,21,0,25.1389,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,18.69,2337.62,0.0 +gfx950,256,2,7168,18432,21,0,27.5631,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.17,2398.4,0.0 +gfx950,256,2,8192,4096,21,0,8.3685,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.04,2009.21,0.0 +gfx950,256,2,8192,8192,21,0,14.1231,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.01,2378.75,0.0 +gfx950,256,2,8192,28672,21,0,44.8066,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,20.97,2622.43,0.0 +gfx950,256,2,10240,8192,21,0,14.7818,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.7,2840.8,0.0 +gfx950,256,2,12288,512,29,0,4.3287,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,5.81,738.19,0.0 +gfx950,256,2,12288,1536,29,0,6.1797,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,12.22,1535.33,0.0 +gfx950,256,2,12288,4096,21,0,9.7868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,20.57,2576.85,0.0 +gfx950,256,2,12288,6144,29,0,11.8365,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,25.51,3193.85,0.0 +gfx950,256,2,16384,512,21,0,4.3413,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.73,981.35,0.0 +gfx950,256,2,16384,2048,21,0,6.2153,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,21.59,2710.22,0.0 +gfx950,256,2,16384,16384,29,0,32.719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,32.82,4104.64,0.0 +gfx950,256,2,16384,53248,21,0,86.2066,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,40.48,5061.4,0.0 +gfx950,256,2,18432,7168,21,0,14.9357,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,35.38,4428.4,0.0 +gfx950,256,2,20480,16384,21,0,32.5723,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,41.21,5153.78,0.0 +gfx950,256,2,24576,1536,21,0,6.1676,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,24.48,3076.43,0.0 +gfx950,256,2,28672,4096,29,0,14.0902,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,33.34,4175.88,0.0 +gfx950,256,2,32768,512,21,0,5.2683,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,12.74,1617.26,0.0 +gfx950,256,2,57344,8192,14,0,44.6413,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,42.09,5266.84,0.0 +gfx950,256,2,59136,8192,14,0,46.4974,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,41.67,5214.61,0.0 +gfx950,256,2,106496,16384,17,0,148.879,a4w4_blockscale_256x32x512x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,46.88,5862.87,0.0 +gfx950,256,4,512,4096,21,0,5.4783,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,3.06,193.65,0.0 +gfx950,256,4,1024,3072,21,0,5.8909,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,4.27,269.43,0.0 +gfx950,256,4,1024,4096,29,0,6.9926,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.8,302.25,0.0 +gfx950,256,4,1536,3072,29,0,6.8371,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,5.52,347.77,0.0 +gfx950,256,4,1536,7168,21,0,12.4212,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,7.09,445.34,0.0 +gfx950,256,4,2048,6144,21,0,10.9315,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.21,578.16,0.0 +gfx950,256,4,2048,7168,21,0,12.6032,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,9.32,584.83,0.0 +gfx950,256,4,3072,1536,21,0,5.7958,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.51,411.84,0.0 +gfx950,256,4,3072,6144,21,0,10.8927,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,13.86,869.76,0.0 +gfx950,256,4,4096,1024,29,0,5.0499,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.64,422.18,0.0 +gfx950,256,4,4096,4096,21,0,8.8735,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,15.13,949.97,0.0 +gfx950,256,4,4096,8192,21,0,12.641,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,21.24,1331.09,0.0 +gfx950,256,4,4096,14336,21,0,21.2148,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.14,1386.84,0.0 +gfx950,256,4,6144,2048,21,0,6.4043,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,15.72,990.69,0.0 +gfx950,256,4,6144,3072,21,0,7.677,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,19.67,1236.48,0.0 +gfx950,256,4,6144,4096,21,0,8.9851,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.41,1406.8,0.0 +gfx950,256,4,6144,12288,21,0,18.6823,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,32.33,2024.51,0.0 +gfx950,256,4,6144,16384,21,0,24.7138,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,32.59,2039.9,0.0 +gfx950,256,4,7168,2048,37,0,6.45,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,18.21,1147.52,0.0 +gfx950,256,4,7168,8192,21,0,13.9848,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,33.59,2104.7,0.0 +gfx950,256,4,7168,16384,21,0,25.2649,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,37.19,2327.75,0.0 +gfx950,256,4,7168,18432,21,0,27.7628,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,38.07,2382.85,0.0 +gfx950,256,4,8192,4096,21,0,8.9554,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,29.97,1881.65,0.0 +gfx950,256,4,8192,8192,21,0,14.12,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,38.02,2382.18,0.0 +gfx950,256,4,8192,28672,29,0,45.3869,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,41.4,2590.25,0.0 +gfx950,256,4,10240,8192,21,0,14.8201,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,45.28,2836.78,0.0 +gfx950,256,4,12288,512,29,0,4.3638,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,11.53,743.63,0.0 +gfx950,256,4,12288,1536,21,0,5.9719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,25.28,1597.24,0.0 +gfx950,256,4,12288,4096,37,0,9.9657,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,40.4,2535.93,0.0 +gfx950,256,4,12288,6144,21,0,11.6196,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,51.98,3258.23,0.0 +gfx950,256,4,16384,512,29,0,4.4621,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,15.04,969.59,0.0 +gfx950,256,4,16384,2048,21,0,6.5496,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,40.99,2582.2,0.0 +gfx950,256,4,16384,4096,22,0,21.86,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,24.56,1541.34,0.0 +gfx950,256,4,16384,16384,21,0,32.3766,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,66.33,4150.58,0.0 +gfx950,256,4,16384,53248,21,0,87.0063,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,80.22,5016.25,0.0 +gfx950,256,4,18432,7168,21,0,14.8505,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,71.17,4459.25,0.0 +gfx950,256,4,18432,16384,0,0,47.9528,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,50.38,3152.58,0.0 +gfx950,256,4,20480,16384,21,0,32.6683,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,82.17,5141.64,0.0 +gfx950,256,4,24576,1536,21,0,6.1764,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,48.89,3088.21,0.0 +gfx950,256,4,28672,4096,21,0,13.9716,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,67.25,4219.83,0.0 +gfx950,256,4,32768,512,21,0,5.2987,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,25.33,1632.81,0.0 +gfx950,256,4,57344,8192,14,0,44.7537,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,83.97,5258.92,0.0 +gfx950,256,4,59136,8192,14,0,46.6928,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,83.0,5198.03,0.0 +gfx950,256,4,106496,16384,3,0,147.4268,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,94.68,5923.62,0.0 +gfx950,256,8,512,4096,21,0,5.5567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.04,193.13,0.0 +gfx950,256,8,512,7168,29,0,9.6677,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.07,193.62,0.0 +gfx950,256,8,1024,3072,21,0,6.0269,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,8.35,265.73,0.0 +gfx950,256,8,1024,4096,29,0,6.937,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,9.67,307.04,0.0 +gfx950,256,8,1536,3072,29,0,6.7419,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,11.2,355.41,0.0 +gfx950,256,8,1536,7168,21,0,12.6568,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,13.92,439.15,0.0 +gfx950,256,8,2048,6144,29,0,11.2567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,17.89,564.0,0.0 +gfx950,256,8,2048,7168,21,0,12.8216,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,18.32,577.27,0.0 +gfx950,256,8,2112,7168,21,0,12.8435,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,18.86,594.22,0.0 +gfx950,256,8,3072,1536,42,0,5.4682,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,13.81,441.57,0.0 +gfx950,256,8,3072,6144,21,0,11.0931,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,27.22,857.37,0.0 +gfx950,256,8,4096,1024,29,0,5.0957,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,13.17,425.22,0.0 +gfx950,256,8,4096,4096,21,0,9.0927,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,29.52,931.57,0.0 +gfx950,256,8,4096,8192,21,0,12.8815,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,41.68,1310.06,0.0 +gfx950,256,8,4096,14336,21,0,21.6872,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,43.32,1359.47,0.0 +gfx950,256,8,6144,2048,29,0,6.4847,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,31.05,986.62,0.0 +gfx950,256,8,6144,3072,21,0,7.2755,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,41.51,1312.32,0.0 +gfx950,256,8,6144,4096,21,0,9.0955,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,44.27,1396.03,0.0 +gfx950,256,8,6144,12288,21,0,19.0255,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.49,1991.86,0.0 +gfx950,256,8,6144,16384,21,0,25.2355,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.82,2000.97,0.0 +gfx950,256,8,7168,2048,29,0,5.836,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,40.25,1278.77,0.0 +gfx950,256,8,7168,8192,21,0,14.1279,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,66.5,2088.6,0.0 +gfx950,256,8,7168,16384,21,0,25.6172,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,73.35,2299.26,0.0 +gfx950,256,8,7168,18432,21,0,28.2761,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.76,2342.92,0.0 +gfx950,256,8,8192,4096,21,0,8.5155,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.05,1987.51,0.0 +gfx950,256,8,8192,8192,21,0,14.4423,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.35,2334.69,0.0 +gfx950,256,8,8192,28672,21,0,45.9253,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,81.83,2562.56,0.0 +gfx950,256,8,10240,8192,21,0,15.0535,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,89.16,2799.33,0.0 +gfx950,256,8,12288,512,29,0,4.4584,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,22.58,750.13,0.0 +gfx950,256,8,12288,1536,21,0,5.8293,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,51.81,1653.7,0.0 +gfx950,256,8,12288,4096,29,0,9.7782,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,82.36,2595.45,0.0 +gfx950,256,8,12288,6144,21,0,11.7431,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,102.87,3233.38,0.0 +gfx950,256,8,16384,512,42,0,4.8202,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,27.84,924.96,0.0 +gfx950,256,8,16384,2048,21,0,6.3214,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,84.93,2696.8,0.0 +gfx950,256,8,16384,16384,21,0,29.9374,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,143.46,4494.22,0.0 +gfx950,256,8,16384,53248,21,0,88.6774,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,157.41,4924.4,0.0 +gfx950,256,8,18432,7168,21,0,15.1166,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,139.84,4391.46,0.0 +gfx950,256,8,18432,16384,22,0,59.6071,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,81.06,2539.22,0.0 +gfx950,256,8,20480,16384,21,0,32.719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,164.09,5139.69,0.0 +gfx950,256,8,24576,1536,21,0,6.2605,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,96.47,3078.62,0.0 +gfx950,256,8,26624,16384,24,0,82.4334,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,84.67,2651.78,0.0 +gfx950,256,8,28672,4096,29,0,13.8939,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,135.24,4260.53,0.0 +gfx950,256,8,32768,512,21,0,5.0403,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,53.26,1768.73,0.0 +gfx950,256,8,57344,8192,14,0,44.7964,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,167.79,5264.51,0.0 +gfx950,256,8,59136,8192,14,0,46.5871,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,166.38,5220.33,0.0 +gfx950,256,8,106496,16384,3,0,147.25,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,189.59,5936.74,0.0 +gfx950,256,16,512,4096,21,0,5.6787,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,11.82,193.31,0.0 +gfx950,256,16,512,7168,4,0,16.8692,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,6.96,113.15,0.0 +gfx950,256,16,800,5120,22,0,21.2953,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,6.15,99.3,0.0 +gfx950,256,16,1024,3072,21,0,6.1362,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,16.4,265.67,0.0 +gfx950,256,16,1024,4096,29,0,7.1983,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,18.65,300.44,0.0 +gfx950,256,16,1280,8192,42,0,16.8681,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,19.89,317.13,0.0 +gfx950,256,16,1536,3072,21,0,6.7767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.28,359.03,0.0 +gfx950,256,16,1536,7168,21,0,13.1256,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,26.84,427.52,0.0 +gfx950,256,16,2048,6144,21,0,11.4627,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,35.13,558.87,0.0 +gfx950,256,16,2048,7168,21,0,13.274,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,35.39,562.22,0.0 +gfx950,256,16,2304,16384,21,0,24.026,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,50.28,794.11,0.0 +gfx950,256,16,2560,8192,37,0,17.9374,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,37.41,592.8,0.0 +gfx950,256,16,3072,1536,21,0,6.009,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,25.13,411.03,0.0 +gfx950,256,16,3072,6144,21,0,11.4387,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,52.8,837.91,0.0 +gfx950,256,16,4096,1024,29,0,5.2016,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,25.8,429.95,0.0 +gfx950,256,16,4096,4096,21,0,9.4039,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,57.09,909.46,0.0 +gfx950,256,16,4096,8192,21,0,13.3131,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,80.65,1274.97,0.0 +gfx950,256,16,4096,14336,21,0,22.4402,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,83.74,1319.32,0.0 +gfx950,256,16,4608,7168,0,0,22.3101,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,47.38,749.43,0.0 +gfx950,256,16,4608,16384,21,0,25.737,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,93.87,1477.53,0.0 +gfx950,256,16,5120,1280,29,0,6.4902,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,32.31,531.71,0.0 +gfx950,256,16,5120,5120,37,0,14.0232,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,59.82,949.28,0.0 +gfx950,256,16,5120,6400,46,0,14.2958,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,73.35,1161.11,0.0 +gfx950,256,16,5120,25600,21,0,39.0242,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,107.48,1688.81,0.0 +gfx950,256,16,6144,2048,21,0,6.5528,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,61.45,992.62,0.0 +gfx950,256,16,6144,3072,29,0,7.6676,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,78.77,1259.63,0.0 +gfx950,256,16,6144,4096,29,0,9.8679,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,81.61,1298.38,0.0 +gfx950,256,16,6144,12288,21,0,19.6767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,122.78,1933.44,0.0 +gfx950,256,16,6144,16384,21,0,25.9976,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,123.9,1948.62,0.0 +gfx950,256,16,6400,5120,29,0,13.8091,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,75.93,1204.26,0.0 +gfx950,256,16,7168,2048,21,0,6.4858,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,72.43,1169.6,0.0 +gfx950,256,16,7168,2304,0,0,9.5571,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,55.3,889.95,0.0 +gfx950,256,16,7168,8192,21,0,14.6268,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,128.47,2027.45,0.0 +gfx950,256,16,7168,16384,21,0,26.4865,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,141.89,2230.6,0.0 +gfx950,256,16,7168,18432,21,0,29.2062,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,144.76,2274.76,0.0 +gfx950,256,16,8192,1024,21,0,5.453,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,49.23,818.75,0.0 +gfx950,256,16,8192,2048,21,0,7.2534,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.02,1194.91,0.0 +gfx950,256,16,8192,3584,21,0,8.9059,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,105.49,1681.01,0.0 +gfx950,256,16,8192,4096,21,0,8.6726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,123.81,1968.51,0.0 +gfx950,256,16,8192,7168,21,0,16.4747,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,114.06,1801.53,0.0 +gfx950,256,16,8192,8192,21,0,14.8938,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,144.19,2274.91,0.0 +gfx950,256,16,8192,28672,21,0,46.3894,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,162.02,2542.22,0.0 +gfx950,256,16,9216,16384,21,0,27.5351,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,175.48,2757.33,0.0 +gfx950,256,16,10240,8192,21,0,15.5584,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,172.53,2721.12,0.0 +gfx950,256,16,12288,512,21,0,4.4122,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,45.63,803.01,0.0 +gfx950,256,16,12288,1536,21,0,6.3038,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,95.81,1561.39,0.0 +gfx950,256,16,12288,4096,42,0,10.1125,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,159.27,2530.71,0.0 +gfx950,256,16,12288,6144,21,0,12.4264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,194.42,3073.38,0.0 +gfx950,256,16,12800,5120,21,0,15.1771,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,138.18,2188.73,0.0 +gfx950,256,16,13312,16384,21,0,29.6788,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,235.16,3693.17,0.0 +gfx950,256,16,14336,8192,21,0,18.0277,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,208.46,3286.31,0.0 +gfx950,256,16,16384,512,21,0,4.4994,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,59.66,1049.63,0.0 +gfx950,256,16,16384,2048,21,0,6.429,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,167.02,2693.71,0.0 +gfx950,256,16,16384,4096,21,0,13.7825,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,155.81,2474.99,0.0 +gfx950,256,16,16384,6656,29,0,18.0628,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,193.2,3050.66,0.0 +gfx950,256,16,16384,8192,21,0,20.1001,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,213.68,3368.08,0.0 +gfx950,256,16,16384,13312,37,0,26.8849,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,259.6,4079.71,0.0 +gfx950,256,16,16384,16384,21,0,30.6345,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,280.4,4402.65,0.0 +gfx950,256,16,16384,26624,37,0,49.6732,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,281.01,4405.62,0.0 +gfx950,256,16,16384,53248,21,0,91.0136,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,306.74,4803.21,0.0 +gfx950,256,16,18432,7168,21,0,15.5712,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,271.52,4284.03,0.0 +gfx950,256,16,18432,16384,22,0,60.0972,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,160.8,2524.51,0.0 +gfx950,256,16,20480,16384,21,0,33.3906,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,321.57,5048.09,0.0 +gfx950,256,16,24576,1536,21,0,6.4818,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,186.36,3035.13,0.0 +gfx950,256,16,26624,16384,42,0,45.9082,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,304.06,4772.28,0.0 +gfx950,256,16,28672,4096,21,0,14.2242,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,264.2,4195.0,0.0 +gfx950,256,16,32768,512,21,0,5.0649,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,106.0,1864.06,0.0 +gfx950,256,16,51200,5120,14,0,24.0294,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,349.1,5524.54,0.0 +gfx950,256,16,53248,16384,12,0,74.0858,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,376.82,5912.64,0.0 +gfx950,256,16,57344,8192,14,0,44.8301,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,335.32,5281.75,0.0 +gfx950,256,16,59136,8192,14,0,47.0904,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,329.2,5185.32,0.0 +gfx950,256,16,106496,16384,17,0,145.7428,a4w4_blockscale_256x32x512x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,383.1,6010.27,0.0 +gfx950,256,32,512,4096,21,0,5.9227,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.66,193.64,0.0 +gfx950,256,32,512,7168,4,0,17.0396,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,13.78,116.34,0.0 +gfx950,256,32,800,5120,22,0,21.989,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,11.92,99.19,0.0 +gfx950,256,32,1024,3072,29,0,6.3604,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,31.65,265.32,0.0 +gfx950,256,32,1024,4096,29,0,7.9175,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,33.9,281.43,0.0 +gfx950,256,32,1280,8192,21,0,16.8907,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,39.73,323.01,0.0 +gfx950,256,32,1536,3072,29,0,7.4775,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,40.39,335.24,0.0 +gfx950,256,32,1536,7168,21,0,13.9479,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,50.52,409.96,0.0 +gfx950,256,32,2048,6144,29,0,12.2392,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,65.8,532.78,0.0 +gfx950,256,32,2048,7168,21,0,14.1868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,66.23,534.71,0.0 +gfx950,256,32,2304,16384,21,0,25.8248,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,93.55,746.72,0.0 +gfx950,256,32,2560,8192,29,0,17.301,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,77.58,623.12,0.0 +gfx950,256,32,3072,1536,29,0,6.1627,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,49.0,418.73,0.0 +gfx950,256,32,3072,6144,21,0,12.2126,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,98.91,796.89,0.0 +gfx950,256,32,4096,1024,14,0,5.3091,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,50.56,447.47,0.0 +gfx950,256,32,4096,4096,21,0,10.0025,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,107.35,871.41,0.0 +gfx950,256,32,4096,8192,21,0,14.4168,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,148.96,1191.0,0.0 +gfx950,256,32,4096,14336,21,0,24.1615,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,155.54,1235.5,0.0 +gfx950,256,32,4608,7168,0,0,22.2165,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,95.15,761.81,0.0 +gfx950,256,32,4608,16384,21,0,27.3808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,176.47,1399.0,0.0 +gfx950,256,32,5120,1280,29,0,6.1505,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,68.19,589.38,0.0 +gfx950,256,32,5120,5120,29,0,14.5083,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,115.64,931.66,0.0 +gfx950,256,32,5120,6400,29,0,16.8268,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,124.63,999.24,0.0 +gfx950,256,32,5120,25600,21,0,42.0705,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,199.39,1575.29,0.0 +gfx950,256,32,6144,2048,29,0,6.6444,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,121.2,1010.99,0.0 +gfx950,256,32,6144,3072,29,0,9.4195,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,128.24,1048.84,0.0 +gfx950,256,32,6144,4096,46,0,10.1863,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,158.12,1280.31,0.0 +gfx950,256,32,6144,12288,21,0,21.2506,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,227.37,1804.12,0.0 +gfx950,256,32,6144,16384,21,0,27.663,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,232.89,1843.15,0.0 +gfx950,256,32,6400,5120,21,0,15.6923,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,133.64,1075.4,0.0 +gfx950,256,32,7168,2048,42,0,6.821,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,137.74,1148.15,0.0 +gfx950,256,32,7168,2304,0,0,9.7787,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,108.09,895.12,0.0 +gfx950,256,32,7168,8192,21,0,15.6065,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,240.8,1919.07,0.0 +gfx950,256,32,7168,16384,21,0,28.1831,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,266.69,2109.11,0.0 +gfx950,256,32,7168,18432,21,0,31.3603,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,269.63,2130.53,0.0 +gfx950,256,32,8192,1024,21,0,5.8783,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,91.33,805.5,0.0 +gfx950,256,32,8192,2048,21,0,7.2694,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,147.71,1230.59,0.0 +gfx950,256,32,8192,3584,21,0,9.4337,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,199.18,1617.78,0.0 +gfx950,256,32,8192,4096,29,0,9.5391,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,225.12,1820.62,0.0 +gfx950,256,32,8192,7168,21,0,18.6983,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,200.99,1604.38,0.0 +gfx950,256,32,8192,8192,21,0,15.8689,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,270.65,2155.78,0.0 +gfx950,256,32,8192,28672,21,0,50.1679,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,299.64,2360.54,0.0 +gfx950,256,32,9216,16384,21,0,29.3716,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,329.01,2599.43,0.0 +gfx950,256,32,10240,8192,21,0,16.5466,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,324.46,2582.37,0.0 +gfx950,256,32,12288,512,21,0,4.5819,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,87.88,859.98,0.0 +gfx950,256,32,12288,1536,21,0,6.3551,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,190.08,1612.59,0.0 +gfx950,256,32,12288,4096,21,0,9.7836,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,329.25,2659.33,0.0 +gfx950,256,32,12288,6144,21,0,13.449,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,359.27,2872.59,0.0 +gfx950,256,32,12800,5120,21,0,17.2457,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,243.21,1952.32,0.0 +gfx950,256,32,13312,16384,21,0,32.0127,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,436.03,3441.32,0.0 +gfx950,256,32,14336,8192,29,0,18.6847,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,402.26,3198.81,0.0 +gfx950,256,32,16384,512,21,0,4.6435,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,115.62,1130.84,0.0 +gfx950,256,32,16384,2048,21,0,6.8106,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,315.31,2622.17,0.0 +gfx950,256,32,16384,4096,21,0,13.3393,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,321.98,2598.98,0.0 +gfx950,256,32,16384,6656,21,0,18.0028,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,387.68,3092.91,0.0 +gfx950,256,32,16384,8192,21,0,20.7777,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,413.42,3286.63,0.0 +gfx950,256,32,16384,13312,21,0,27.8363,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,501.45,3962.94,0.0 +gfx950,256,32,16384,16384,21,0,33.0522,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,519.78,4100.44,0.0 +gfx950,256,32,16384,26624,29,0,51.4954,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,542.13,4264.04,0.0 +gfx950,256,32,16384,53248,21,0,96.3502,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,579.5,4547.04,0.0 +gfx950,256,32,18432,7168,21,0,17.1967,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,491.71,3916.72,0.0 +gfx950,256,32,18432,16384,22,0,63.2576,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,305.53,2409.78,0.0 +gfx950,256,32,20480,16384,29,0,35.5786,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,603.59,4759.74,0.0 +gfx950,256,32,24576,1536,21,0,6.3941,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,377.84,3201.67,0.0 +gfx950,256,32,26624,16384,37,0,44.8726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,622.15,4904.33,0.0 +gfx950,256,32,28672,4096,21,0,14.8004,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,507.84,4095.89,0.0 +gfx950,256,32,32768,512,43,0,5.7265,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,187.5,1832.52,0.0 +gfx950,256,32,51200,5120,14,0,25.3094,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,662.88,5311.49,0.0 +gfx950,256,32,53248,16384,12,0,75.1943,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,742.54,5849.88,0.0 +gfx950,256,32,57344,8192,14,0,45.7848,a4w4_blockscale_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,656.65,5213.13,0.0 +gfx950,256,32,59136,8192,15,0,47.9633,a4w4_blockscale_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,646.42,5131.77,0.0 +gfx950,256,32,106496,16384,17,0,142.9821,a4w4_blockscale_256x32x512x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,781.0,6151.07,0.0 +gfx950,256,64,192,1024,21,0,3.8719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.5,40.2,0.0 +gfx950,256,64,512,4096,21,0,6.2843,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,42.72,198.14,0.0 +gfx950,256,64,512,7168,4,0,18.2008,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,25.81,117.02,0.0 +gfx950,256,64,800,5120,22,0,25.8477,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,20.28,89.53,0.0 +gfx950,256,64,1024,3072,21,0,6.3028,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,63.88,285.94,0.0 +gfx950,256,64,1024,4096,21,0,8.9889,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,59.73,262.47,0.0 +gfx950,256,64,1280,8192,21,0,17.9567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,74.75,315.7,0.0 +gfx950,256,64,1536,3072,21,0,6.9252,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,87.21,383.27,0.0 +gfx950,256,64,1536,7168,21,0,13.372,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,105.39,443.54,0.0 +gfx950,256,64,2048,6144,21,0,11.8464,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,135.96,569.81,0.0 +gfx950,256,64,2048,7168,21,0,13.7284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,136.87,570.46,0.0 +gfx950,256,64,2304,16384,21,0,25.9264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,186.37,759.6,0.0 +gfx950,256,64,2560,8192,21,0,17.5991,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,152.53,629.33,0.0 +gfx950,256,64,3072,1536,21,0,6.149,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,98.22,455.63,0.0 +gfx950,256,64,3072,6144,21,0,11.8707,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,203.52,844.69,0.0 +gfx950,256,64,4096,1024,21,0,5.4105,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,99.23,490.57,0.0 +gfx950,256,64,4096,4096,29,0,9.9442,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,215.95,909.47,0.0 +gfx950,256,64,4096,8192,21,0,14.0415,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,305.88,1250.84,0.0 +gfx950,256,64,4096,14336,21,0,23.3334,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,322.12,1300.42,0.0 +gfx950,256,64,4608,7168,0,0,23.1497,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,182.63,748.79,0.0 +gfx950,256,64,4608,16384,21,0,26.8437,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,360.0,1447.75,0.0 +gfx950,256,64,5120,1280,21,0,6.7463,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,124.34,588.93,0.0 +gfx950,256,64,5120,5120,29,0,13.4294,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,249.86,1037.01,0.0 +gfx950,256,64,5120,6400,37,0,15.43,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,271.83,1117.57,0.0 +gfx950,256,64,5120,25600,21,0,40.7807,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,411.4,1643.19,0.0 +gfx950,256,64,6144,2048,21,0,6.6403,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,242.55,1075.77,0.0 +gfx950,256,64,6144,3072,21,0,8.6835,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,278.22,1188.68,0.0 +gfx950,256,64,6144,4096,29,0,9.8189,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,328.06,1374.94,0.0 +gfx950,256,64,6144,12288,21,0,20.6497,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,467.98,1885.18,0.0 +gfx950,256,64,6144,16384,21,0,27.112,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,475.25,1904.78,0.0 +gfx950,256,64,6400,5120,21,0,15.5961,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,268.93,1113.55,0.0 +gfx950,256,64,7168,2048,21,0,6.8112,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,275.88,1221.97,0.0 +gfx950,256,64,7168,2304,0,0,10.2539,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,206.16,901.98,0.0 +gfx950,256,64,7168,8192,21,0,15.3196,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,490.63,1993.51,0.0 +gfx950,256,64,7168,16384,21,0,27.6557,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,543.55,2175.39,0.0 +gfx950,256,64,7168,18432,21,0,30.5745,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,553.12,2209.93,0.0 +gfx950,256,64,8192,1024,21,0,5.6831,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,188.94,928.3,0.0 +gfx950,256,64,8192,2048,21,0,7.7271,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,277.92,1229.79,0.0 +gfx950,256,64,8192,3584,21,0,9.2243,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,407.41,1717.56,0.0 +gfx950,256,64,8192,4096,29,0,9.9642,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,431.04,1802.14,0.0 +gfx950,256,64,8192,7168,37,0,17.772,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,422.92,1723.95,0.0 +gfx950,256,64,8192,8192,21,0,15.5361,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,552.9,2244.14,0.0 +gfx950,256,64,8192,28672,21,0,48.3134,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,622.29,2471.5,0.0 +gfx950,256,64,9216,16384,21,0,29.2875,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,659.92,2635.98,0.0 +gfx950,256,64,10240,8192,21,0,16.349,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,656.76,2661.69,0.0 +gfx950,256,64,12288,512,21,0,4.5808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,175.8,1033.66,0.0 +gfx950,256,64,12288,1536,21,0,6.3268,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,381.85,1747.99,0.0 +gfx950,256,64,12288,4096,21,0,10.0,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,644.25,2686.98,0.0 +gfx950,256,64,12288,6144,21,0,13.4251,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,719.82,2943.61,0.0 +gfx950,256,64,12800,5120,21,0,16.7255,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,501.55,2066.92,0.0 +gfx950,256,64,13312,16384,21,0,35.8494,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,778.74,3104.1,0.0 +gfx950,256,64,14336,8192,21,0,18.4161,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,816.26,3302.4,0.0 +gfx950,256,64,16384,512,21,0,5.3495,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,200.72,1179.15,0.0 +gfx950,256,64,16384,2048,21,0,6.8013,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,631.49,2784.75,0.0 +gfx950,256,64,16384,4096,21,0,14.4039,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,596.36,2484.23,0.0 +gfx950,256,64,16384,6656,21,0,18.7155,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,745.83,3036.85,0.0 +gfx950,256,64,16384,8192,21,0,20.7077,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,829.64,3354.7,0.0 +gfx950,256,64,16384,13312,21,0,32.0337,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,871.5,3483.05,0.0 +gfx950,256,64,16384,16384,21,0,32.8484,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1046.01,4165.78,0.0 +gfx950,256,64,16384,26624,21,0,51.1923,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1090.68,4318.09,0.0 +gfx950,256,64,16384,53248,21,0,96.564,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1156.43,4556.65,0.0 +gfx950,256,64,18432,7168,29,0,20.1172,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,840.65,3412.45,0.0 +gfx950,256,64,18432,16384,22,0,63.2113,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,611.52,2434.35,0.0 +gfx950,256,64,20480,16384,29,0,41.0443,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1046.42,4164.23,0.0 +gfx950,256,64,24576,1536,29,0,7.7228,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,625.66,2857.67,0.0 +gfx950,256,64,26624,16384,29,0,46.4119,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1203.02,4784.03,0.0 +gfx950,256,64,28672,4096,29,0,16.0086,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,939.02,3905.48,0.0 +gfx950,256,64,32768,512,46,0,5.601,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,383.41,2249.47,0.0 +gfx950,256,64,51200,5120,0,0,28.5654,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1174.65,4823.65,0.0 +gfx950,256,64,53248,16384,0,0,81.0859,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1377.17,5470.1,0.0 +gfx950,256,64,57344,8192,12,0,48.14,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,1249.06,5037.04,0.0 +gfx950,256,64,59136,8192,1,0,50.0197,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1239.68,4999.08,0.0 +gfx950,256,64,106496,16384,3,0,150.9403,a4w4_blockscale_256x64x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1479.65,5873.65,0.0 +gfx950,256,65,1280,8192,0,0,23.9233,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,56.98,237.24,0.0 +gfx950,256,112,5120,1280,21,0,6.6107,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,222.07,680.01,0.0 +gfx950,256,112,5120,5120,21,0,14.0483,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,417.99,1035.06,0.0 +gfx950,256,112,5120,6400,29,0,14.1005,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,520.55,1268.7,0.0 +gfx950,256,112,5120,25600,21,0,38.6535,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,759.57,1762.23,0.0 +gfx950,256,112,6400,5120,21,0,14.4899,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,506.56,1249.44,0.0 +gfx950,256,112,8192,7168,29,0,17.7127,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,742.59,1783.84,0.0 +gfx950,256,112,12800,5120,37,0,17.1116,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,857.9,2099.27,0.0 +gfx950,256,112,51200,5120,9,0,34.2442,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1714.75,4170.85,0.0 +gfx950,256,127,1280,8192,0,0,24.5329,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,108.56,248.16,0.0 +gfx950,256,128,128,49920,21,0,72.1898,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,22.66,88.97,0.0 +gfx950,256,128,128,322816,21,0,446.2108,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,23.71,92.68,0.0 +gfx950,256,128,128,423168,21,0,583.2756,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,23.77,92.92,0.0 +gfx950,256,128,256,256,29,0,3.4878,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,4.81,32.88,0.0 +gfx950,256,128,256,1024,29,0,4.1218,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,16.28,63.6,0.0 +gfx950,256,128,512,4096,21,0,6.4995,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,82.6,221.83,0.0 +gfx950,256,128,512,7168,0,0,22.7857,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,41.23,106.42,0.0 +gfx950,256,128,800,5120,24,0,32.7867,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,31.98,78.71,0.0 +gfx950,256,128,1024,3072,21,0,6.4269,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,125.3,316.11,0.0 +gfx950,256,128,1024,4096,21,0,8.7663,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,122.49,299.04,0.0 +gfx950,256,128,1280,8192,21,0,14.3547,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,187.0,424.59,0.0 +gfx950,256,128,1536,3072,21,0,6.826,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,176.96,432.04,0.0 +gfx950,256,128,1536,7168,21,0,13.0508,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,215.97,487.1,0.0 +gfx950,256,128,2048,6144,21,0,11.6,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,277.69,621.46,0.0 +gfx950,256,128,2048,7168,21,0,13.396,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,280.54,621.31,0.0 +gfx950,256,128,2304,16384,21,0,26.3592,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,366.61,778.2,0.0 +gfx950,256,128,2560,8192,21,0,17.4515,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,307.64,668.45,0.0 +gfx950,256,128,3072,1536,21,0,6.1683,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,195.83,525.92,0.0 +gfx950,256,128,3072,6144,21,0,11.183,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,432.07,949.37,0.0 +gfx950,256,128,4096,1024,21,0,5.3851,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,199.39,596.32,0.0 +gfx950,256,128,4096,4096,21,0,10.0526,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,427.25,964.86,0.0 +gfx950,256,128,4096,8192,21,0,13.7904,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,622.89,1330.64,0.0 +gfx950,256,128,4096,14336,21,0,22.8828,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,656.93,1368.98,0.0 +gfx950,256,128,4608,7168,0,0,23.5505,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,359.05,770.83,0.0 +gfx950,256,128,4608,16384,21,0,26.1969,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,737.77,1526.02,0.0 +gfx950,256,128,5120,1280,21,0,6.7817,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,247.39,688.54,0.0 +gfx950,256,128,5120,5120,21,0,13.8116,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,485.89,1067.62,0.0 +gfx950,256,128,5120,6400,21,0,15.856,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,529.05,1141.8,0.0 +gfx950,256,128,5120,25600,21,0,40.0938,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,836.9,1708.12,0.0 +gfx950,256,128,6144,2048,21,0,6.794,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,474.13,1176.83,0.0 +gfx950,256,128,6144,3072,21,0,9.1419,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,528.54,1225.86,0.0 +gfx950,256,128,6144,4096,21,0,10.2408,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,629.1,1407.89,0.0 +gfx950,256,128,6144,12288,21,0,20.3662,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,948.99,1969.34,0.0 +gfx950,256,128,6144,16384,21,0,26.9082,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,957.69,1967.92,0.0 +gfx950,256,128,6400,5120,21,0,14.762,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,568.26,1243.06,0.0 +gfx950,256,128,7168,2048,21,0,6.8857,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,545.78,1351.51,0.0 +gfx950,256,128,7168,2304,0,0,10.3883,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,406.98,985.72,0.0 +gfx950,256,128,7168,8192,21,0,15.244,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,986.12,2080.78,0.0 +gfx950,256,128,7168,16384,21,0,27.6111,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1088.87,2231.13,0.0 +gfx950,256,128,7168,18432,21,0,30.4024,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1112.51,2272.02,0.0 +gfx950,256,128,8192,1024,21,0,5.7823,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,371.39,1099.39,0.0 +gfx950,256,128,8192,2048,21,0,7.5231,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,570.9,1411.23,0.0 +gfx950,256,128,8192,3584,21,0,9.4332,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,796.78,1802.84,0.0 +gfx950,256,128,8192,4096,21,0,9.9528,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,863.07,1922.73,0.0 +gfx950,256,128,8192,7168,21,0,17.5883,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,854.68,1814.62,0.0 +gfx950,256,128,8192,8192,21,0,15.5661,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1103.67,2324.02,0.0 +gfx950,256,128,8192,28672,21,0,48.4668,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1240.63,2504.24,0.0 +gfx950,256,128,9216,16384,29,0,33.3105,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1160.44,2368.78,0.0 +gfx950,256,128,10240,8192,29,0,18.6592,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1150.9,2416.44,0.0 +gfx950,256,128,12288,512,29,0,5.1498,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,312.75,1228.05,0.0 +gfx950,256,128,12288,1536,37,0,6.7742,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,713.27,1871.99,0.0 +gfx950,256,128,12288,4096,29,0,11.2189,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1148.5,2546.92,0.0 +gfx950,256,128,12288,6144,29,0,15.5916,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1239.6,2648.07,0.0 +gfx950,256,128,12800,5120,0,0,18.1526,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,924.23,2003.71,0.0 +gfx950,256,128,13312,16384,29,0,38.1269,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1464.44,2977.12,0.0 +gfx950,256,128,14336,8192,29,0,21.2529,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1414.62,2960.28,0.0 +gfx950,256,128,16384,512,29,0,5.3611,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,400.57,1570.83,0.0 +gfx950,256,128,16384,2048,29,0,7.8866,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1089.18,2675.75,0.0 +gfx950,256,128,16384,4096,37,0,16.0283,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1071.85,2371.49,0.0 +gfx950,256,128,16384,6656,37,0,19.9661,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1398.23,2962.33,0.0 +gfx950,256,128,16384,8192,29,0,22.9523,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1497.01,3129.42,0.0 +gfx950,256,128,16384,13312,29,0,36.5127,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1529.18,3124.89,0.0 +gfx950,256,128,16384,16384,29,0,38.2318,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1797.44,3647.76,0.0 +gfx950,256,128,16384,26624,29,0,62.8972,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1775.42,3561.4,0.0 +gfx950,256,128,16384,53248,29,0,112.9144,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1977.94,3930.5,0.0 +gfx950,256,128,18432,7168,42,0,24.7415,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1367.05,2879.28,0.0 +gfx950,256,128,18432,16384,22,0,67.9262,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,1138.14,2307.83,0.0 +gfx950,256,128,20480,16384,42,0,49.9568,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1719.47,3484.28,0.0 +gfx950,256,128,24576,1536,8,0,9.4678,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1020.69,2668.43,0.0 +gfx950,256,128,26624,16384,42,0,55.1181,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2026.0,4099.71,0.0 +gfx950,256,128,28672,4096,0,0,18.1925,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1652.59,3645.59,0.0 +gfx950,256,128,32768,512,42,0,7.0236,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,611.51,2393.36,0.0 +gfx950,256,128,51200,5120,9,0,35.8651,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1871.15,4029.18,0.0 +gfx950,256,128,53248,16384,9,0,98.9807,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2256.38,4555.31,0.0 +gfx950,256,128,57344,8192,9,0,52.3976,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2295.13,4772.84,0.0 +gfx950,256,128,59136,8192,9,0,54.1038,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2292.21,4766.47,0.0 +gfx950,256,128,106496,16384,11,0,157.4506,a4w4_blockscale_256x128x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2836.93,5720.69,0.0 +gfx950,256,129,1280,8192,0,0,24.2013,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,111.78,252.11,0.0 +gfx950,256,160,1280,8192,21,0,16.0131,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,209.54,393.92,0.0 +gfx950,256,160,2304,16384,21,0,26.1975,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,461.1,798.64,0.0 +gfx950,256,160,2560,8192,21,0,16.6775,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,402.39,717.15,0.0 +gfx950,256,160,4608,16384,21,0,26.4423,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,913.66,1532.92,0.0 +gfx950,256,160,5120,1280,21,0,6.7723,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,309.67,740.9,0.0 +gfx950,256,160,5120,5120,21,0,14.1087,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,594.57,1074.17,0.0 +gfx950,256,160,5120,6400,21,0,16.2285,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,646.13,1142.09,0.0 +gfx950,256,160,5120,25600,21,0,39.9134,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1050.85,1734.31,0.0 +gfx950,256,160,6400,5120,21,0,15.0264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,697.82,1253.9,0.0 +gfx950,256,160,7168,8192,29,0,18.6767,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1006.09,1729.92,0.0 +gfx950,256,160,8192,1024,29,0,6.5603,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,409.18,1051.43,0.0 +gfx950,256,160,8192,2048,29,0,7.7321,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,694.34,1445.13,0.0 +gfx950,256,160,8192,3584,29,0,13.3701,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,702.71,1315.49,0.0 +gfx950,256,160,8192,7168,37,0,19.1789,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,979.75,1697.44,0.0 +gfx950,256,160,8192,8192,29,0,17.5633,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1222.71,2097.06,0.0 +gfx950,256,160,8192,28672,29,0,54.6667,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1374.91,2238.21,0.0 +gfx950,256,160,9216,16384,29,0,31.4915,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1534.33,2532.66,0.0 +gfx950,256,160,10240,8192,29,0,19.5071,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1376.09,2351.72,0.0 +gfx950,256,160,12800,5120,37,0,17.8875,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1172.41,2083.78,0.0 +gfx950,256,160,13312,16384,37,0,39.7609,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1755.32,2882.79,0.0 +gfx950,256,160,14336,8192,37,0,22.5625,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1665.64,2834.93,0.0 +gfx950,256,160,16384,2048,37,0,9.3976,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1142.57,2360.6,0.0 +gfx950,256,160,16384,4096,42,0,18.124,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1184.88,2158.74,0.0 +gfx950,256,160,16384,6656,37,0,21.2181,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1644.66,2841.98,0.0 +gfx950,256,160,16384,8192,37,0,25.055,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1714.22,2913.87,0.0 +gfx950,256,160,16384,13312,37,0,36.0049,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1938.44,3204.0,0.0 +gfx950,256,160,16384,26624,37,0,67.3608,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2072.22,3347.3,0.0 +gfx950,256,160,26624,16384,46,0,60.9834,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2288.93,3737.64,0.0 +gfx950,256,160,51200,5120,47,0,42.9748,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,1951.98,3440.75,0.0 +gfx950,256,160,53248,16384,47,0,102.4961,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2723.74,4434.88,0.0 +gfx950,256,160,57344,8192,50,0,60.9337,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2467.01,4166.6,0.0 +gfx950,256,192,1280,8192,21,0,15.967,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,252.18,408.39,0.0 +gfx950,256,192,2304,16384,21,0,26.2998,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,551.16,811.11,0.0 +gfx950,256,192,2560,8192,21,0,16.9291,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,475.69,723.92,0.0 +gfx950,256,192,4608,16384,21,0,26.5132,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1093.46,1549.83,0.0 +gfx950,256,192,5120,1280,21,0,6.3153,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,398.49,849.64,0.0 +gfx950,256,192,5120,5120,21,0,14.2691,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,705.46,1090.8,0.0 +gfx950,256,192,5120,6400,21,0,17.0747,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,736.93,1110.68,0.0 +gfx950,256,192,5120,25600,21,0,39.9244,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1260.67,1752.3,0.0 +gfx950,256,192,6400,5120,29,0,17.6335,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,713.58,1096.39,0.0 +gfx950,256,192,7168,8192,29,0,18.7241,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1204.25,1757.04,0.0 +gfx950,256,192,8192,1024,29,0,6.5502,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,491.78,1135.59,0.0 +gfx950,256,192,8192,2048,29,0,8.5152,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,756.58,1377.65,0.0 +gfx950,256,192,8192,3584,29,0,12.1015,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,931.64,1501.45,0.0 +gfx950,256,192,8192,7168,29,0,18.3343,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1229.86,1810.49,0.0 +gfx950,256,192,8192,8192,29,0,18.9505,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1359.85,1978.13,0.0 +gfx950,256,192,8192,28672,29,0,56.8402,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1586.8,2169.92,0.0 +gfx950,256,192,9216,16384,29,0,33.7474,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1718.12,2388.61,0.0 +gfx950,256,192,10240,8192,29,0,19.2596,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1672.53,2422.77,0.0 +gfx950,256,192,12800,5120,37,0,18.4327,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1365.28,2071.03,0.0 +gfx950,256,192,13312,16384,37,0,41.6901,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2008.91,2776.12,0.0 +gfx950,256,192,14336,8192,37,0,23.7812,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1896.34,2733.74,0.0 +gfx950,256,192,16384,2048,37,0,11.3951,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1130.74,2041.69,0.0 +gfx950,256,192,16384,4096,46,0,18.5424,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1389.78,2170.11,0.0 +gfx950,256,192,16384,6656,37,0,21.9033,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1911.85,2805.8,0.0 +gfx950,256,192,16384,8192,37,0,25.9592,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1985.41,2857.82,0.0 +gfx950,256,192,16384,13312,37,0,38.9353,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2151.05,2995.26,0.0 +gfx950,256,192,16384,26624,37,0,69.5808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2407.33,3261.69,0.0 +gfx950,256,192,26624,16384,49,0,66.9827,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2500.7,3432.23,0.0 +gfx950,256,192,51200,5120,54,0,46.5597,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2162.03,3247.97,0.0 +gfx950,256,192,53248,16384,50,0,112.2081,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2985.59,4083.73,0.0 +gfx950,256,192,57344,8192,50,0,63.3964,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2845.41,4064.7,0.0 +gfx950,256,256,512,4096,21,0,6.343,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,169.28,289.3,0.0 +gfx950,256,256,512,7168,0,0,22.6525,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,82.95,133.08,0.0 +gfx950,256,256,800,5120,22,0,36.8477,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,56.91,84.48,0.0 +gfx950,256,256,1024,3072,21,0,6.8291,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,235.85,364.67,0.0 +gfx950,256,256,1024,4096,21,0,9.0711,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,236.74,346.79,0.0 +gfx950,256,256,1280,8192,21,0,16.2987,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,329.39,426.22,0.0 +gfx950,256,256,1536,3072,21,0,6.9951,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,345.37,505.92,0.0 +gfx950,256,256,1536,7168,21,0,12.9576,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,435.05,556.35,0.0 +gfx950,256,256,2048,6144,21,0,11.5572,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,557.44,703.15,0.0 +gfx950,256,256,2048,7168,21,0,13.2113,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,568.92,704.41,0.0 +gfx950,256,256,2048,8192,21,0,13.9675,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,614.99,750.73,0.0 +gfx950,256,256,2304,16384,21,0,26.2627,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,735.92,843.45,0.0 +gfx950,256,256,2560,8192,21,0,16.8359,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,637.77,762.96,0.0 +gfx950,256,256,3072,1536,21,0,6.1771,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,391.11,668.4,0.0 +gfx950,256,256,3072,6144,21,0,11.1632,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,865.67,1056.73,0.0 +gfx950,256,256,4096,1024,21,0,5.5715,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,385.44,776.34,0.0 +gfx950,256,256,4096,4096,21,0,10.1458,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,846.65,1085.18,0.0 +gfx950,256,256,4096,8192,21,0,13.9413,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1232.3,1429.06,0.0 +gfx950,256,256,4096,14336,21,0,23.1245,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1300.13,1439.7,0.0 +gfx950,256,256,4608,7168,0,0,23.7653,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,711.6,832.81,0.0 +gfx950,256,256,4608,16384,29,0,30.6036,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1263.08,1379.09,0.0 +gfx950,256,256,5120,1280,29,0,6.8795,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,487.75,881.18,0.0 +gfx950,256,256,5120,5120,29,0,15.988,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,839.49,1024.77,0.0 +gfx950,256,256,5120,6400,29,0,14.7318,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1138.84,1345.7,0.0 +gfx950,256,256,5120,25600,29,0,47.1285,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1423.96,1515.73,0.0 +gfx950,256,256,6144,2048,29,0,7.1546,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,900.46,1355.68,0.0 +gfx950,256,256,6144,3072,37,0,9.9868,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,967.64,1299.33,0.0 +gfx950,256,256,6144,4096,29,0,10.2418,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1258.07,1586.92,0.0 +gfx950,256,256,6144,12288,29,0,24.2139,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1596.38,1753.84,0.0 +gfx950,256,256,6144,16384,29,0,31.4059,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1641.08,1769.56,0.0 +gfx950,256,256,6400,5120,29,0,17.2608,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,971.98,1177.01,0.0 +gfx950,256,256,7168,2048,29,0,7.8351,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,959.3,1438.68,0.0 +gfx950,256,256,7168,2304,0,0,10.6915,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,790.88,1143.19,0.0 +gfx950,256,256,7168,8192,29,0,17.9205,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1677.67,1901.66,0.0 +gfx950,256,256,7168,16384,29,0,32.5458,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1847.54,1981.44,0.0 +gfx950,256,256,7168,18432,29,0,36.2989,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1863.58,1986.0,0.0 +gfx950,256,256,8192,1024,29,0,6.5497,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,655.75,1300.77,0.0 +gfx950,256,256,8192,2048,29,0,8.1011,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1060.34,1585.59,0.0 +gfx950,256,256,8192,3584,29,0,14.0739,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1068.1,1373.69,0.0 +gfx950,256,256,8192,4096,29,0,10.3885,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1653.74,2069.19,0.0 +gfx950,256,256,8192,7168,29,0,18.8783,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1592.56,1826.01,0.0 +gfx950,256,256,8192,8192,29,0,18.4185,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1865.5,2106.43,0.0 +gfx950,256,256,8192,28672,29,0,58.3537,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2060.86,2147.33,0.0 +gfx950,256,256,9216,16384,37,0,36.7985,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2100.88,2236.86,0.0 +gfx950,256,256,10240,8192,37,0,21.2222,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2023.81,2272.83,0.0 +gfx950,256,256,12288,512,38,0,6.3583,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,506.62,1494.54,0.0 +gfx950,256,256,12288,1536,1,0,9.8009,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,986.0,1624.88,0.0 +gfx950,256,256,12288,4096,42,0,14.5442,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1771.83,2198.92,0.0 +gfx950,256,256,12288,6144,42,0,20.0605,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1926.91,2234.57,0.0 +gfx950,256,256,12800,5120,42,0,18.789,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1785.86,2127.68,0.0 +gfx950,256,256,13312,16384,42,0,46.7414,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2389.08,2523.78,0.0 +gfx950,256,256,14336,8192,42,0,26.5916,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2261.22,2523.69,0.0 +gfx950,256,256,16384,512,30,0,6.8027,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,631.36,1859.33,0.0 +gfx950,256,256,16384,2048,42,0,10.4519,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1643.71,2432.86,0.0 +gfx950,256,256,16384,4096,38,0,19.0289,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1805.66,2231.73,0.0 +gfx950,256,256,16384,6656,42,0,24.1571,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2311.31,2639.66,0.0 +gfx950,256,256,16384,8192,42,0,28.2495,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2432.59,2709.64,0.0 +gfx950,256,256,16384,13312,42,0,43.2786,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2580.24,2752.96,0.0 +gfx950,256,256,16384,16384,42,0,49.8604,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2756.48,2902.17,0.0 +gfx950,256,256,16384,26624,42,0,78.9684,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2828.2,2911.29,0.0 +gfx950,256,256,16384,53248,42,0,145.1528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,3077.29,3109.91,0.0 +gfx950,256,256,18432,7168,38,0,30.4769,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2219.57,2507.31,0.0 +gfx950,256,256,18432,16384,2,0,97.5677,a4w4_blockscale_256x64x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1584.73,1665.81,0.0 +gfx950,256,256,20480,16384,38,0,62.6988,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2740.06,2876.53,0.0 +gfx950,256,256,24576,1536,8,0,12.7275,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1518.55,2487.05,0.0 +gfx950,256,256,26624,16384,43,0,73.3227,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3045.96,3189.09,0.0 +gfx950,256,256,28672,4096,43,0,24.4961,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2454.66,3017.81,0.0 +gfx950,256,256,32768,512,43,0,9.9975,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,859.21,2523.77,0.0 +gfx950,256,256,51200,5120,54,0,50.4785,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2658.91,3128.89,0.0 +gfx950,256,256,53248,16384,54,0,132.8006,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3363.51,3505.77,0.0 +gfx950,256,256,57344,8192,18,0,71.2376,a4w4_blockscale_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,3376.28,3724.01,0.0 +gfx950,256,256,59136,8192,18,0,74.1445,a4w4_blockscale_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,3345.28,3689.38,0.0 +gfx950,256,256,106496,16384,18,0,249.1001,a4w4_blockscale_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x8x1x32_8_2x4_intrawave_v3,3586.32,3729.58,0.0 +gfx950,256,288,1280,8192,21,0,16.4719,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,366.67,434.67,0.0 +gfx950,256,288,2304,16384,21,0,26.1796,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,830.54,861.77,0.0 +gfx950,256,288,2560,8192,21,0,14.3768,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,840.21,913.97,0.0 +gfx950,256,288,4608,16384,29,0,29.1664,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1490.98,1466.15,0.0 +gfx950,256,288,5120,1280,29,0,7.3211,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,515.62,875.58,0.0 +gfx950,256,288,5120,5120,29,0,13.8399,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1091.01,1213.42,0.0 +gfx950,256,288,5120,6400,42,0,18.5633,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1016.76,1091.12,0.0 +gfx950,256,288,5120,25600,29,0,44.3246,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1703.29,1628.25,0.0 +gfx950,256,288,6400,5120,29,0,16.8247,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1121.82,1236.73,0.0 +gfx950,256,288,7168,8192,37,0,21.1725,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1597.49,1637.43,0.0 +gfx950,256,288,8192,1024,37,0,7.5191,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,642.61,1204.98,0.0 +gfx950,256,288,8192,2048,42,0,10.6831,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,904.58,1254.52,0.0 +gfx950,256,288,8192,3584,42,0,14.8917,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1135.63,1337.31,0.0 +gfx950,256,288,8192,7168,37,0,19.5302,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1731.82,1797.78,0.0 +gfx950,256,288,8192,8192,37,0,21.6423,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1786.07,1822.94,0.0 +gfx950,256,288,8192,28672,37,0,66.796,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2025.44,1890.65,0.0 +gfx950,256,288,9216,16384,37,0,39.9597,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2176.52,2081.23,0.0 +gfx950,256,288,10240,8192,37,0,22.4527,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2152.01,2183.3,0.0 +gfx950,256,288,12800,5120,46,0,20.6536,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1827.71,1979.22,0.0 +gfx950,256,288,13312,16384,46,0,50.3598,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2494.6,2364.56,0.0 +gfx950,256,288,14336,8192,46,0,28.4269,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2379.64,2397.64,0.0 +gfx950,256,288,16384,2048,46,0,14.2595,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1355.4,1859.06,0.0 +gfx950,256,288,16384,4096,46,0,18.264,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2116.44,2386.19,0.0 +gfx950,256,288,16384,6656,46,0,25.5322,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2460.18,2542.73,0.0 +gfx950,256,288,16384,8192,46,0,30.4004,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2543.04,2556.73,0.0 +gfx950,256,288,16384,13312,46,0,45.1824,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2780.46,2664.89,0.0 +gfx950,256,288,16384,26624,46,0,83.936,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2993.42,2756.56,0.0 +gfx950,256,288,26624,16384,47,0,77.0326,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3261.68,3061.02,0.0 +gfx950,256,288,51200,5120,41,0,54.2684,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x640E,2782.37,2972.27,0.0 +gfx950,256,288,53248,16384,41,0,147.3597,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x640E,3410.1,3184.3,0.0 +gfx950,256,288,57344,8192,47,0,87.9704,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3075.84,3058.88,0.0 +gfx950,256,320,2560,8192,21,0,16.4166,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,817.57,818.37,0.0 +gfx950,256,320,5120,1280,29,0,7.4259,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,564.82,910.11,0.0 +gfx950,256,320,5120,5120,22,0,16.5703,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x256E,1012.49,1038.19,0.0 +gfx950,256,320,5120,6400,29,0,18.0264,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1163.38,1147.47,0.0 +gfx950,256,320,5120,25600,29,0,47.2784,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1774.3,1542.12,0.0 +gfx950,256,320,6400,5120,29,0,16.745,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1252.4,1271.97,0.0 +gfx950,256,320,7168,8192,37,0,20.6963,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1815.83,1703.61,0.0 +gfx950,256,320,8192,1024,37,0,7.6808,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,698.98,1250.0,0.0 +gfx950,256,320,8192,2048,37,0,8.971,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1196.9,1556.03,0.0 +gfx950,256,320,8192,3584,37,0,14.0025,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1341.94,1463.77,0.0 +gfx950,256,320,8192,7168,37,0,19.5769,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1919.66,1826.13,0.0 +gfx950,256,320,8192,8192,37,0,21.16,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2029.76,1895.46,0.0 +gfx950,256,320,8192,28672,37,0,70.3383,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2137.16,1809.41,0.0 +gfx950,256,320,10240,8192,42,0,24.4175,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2198.71,2039.82,0.0 +gfx950,256,320,14336,8192,46,0,30.445,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2468.78,2273.15,0.0 +gfx950,256,320,51200,5120,47,0,62.2725,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2694.16,2644.17,0.0 +gfx950,256,320,57344,8192,47,0,88.7921,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3385.97,3073.38,0.0 +gfx950,256,384,1280,8192,21,0,16.1226,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,499.49,483.72,0.0 +gfx950,256,384,2304,16384,21,0,26.2829,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1103.04,905.13,0.0 +gfx950,256,384,2560,8192,21,0,17.85,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,902.3,785.7,0.0 +gfx950,256,384,4608,16384,29,0,31.6982,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1829.19,1401.76,0.0 +gfx950,256,384,5120,1280,29,0,7.0087,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,718.13,1063.64,0.0 +gfx950,256,384,5120,5120,29,0,17.1449,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1174.27,1051.18,0.0 +gfx950,256,384,5120,6400,29,0,18.6953,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1346.1,1152.43,0.0 +gfx950,256,384,5120,25600,29,0,48.0141,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2096.54,1549.2,0.0 +gfx950,256,384,6400,5120,37,0,18.0754,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1392.27,1232.74,0.0 +gfx950,256,384,7168,8192,37,0,21.6494,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2083.07,1683.1,0.0 +gfx950,256,384,8192,1024,37,0,7.5727,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,850.75,1410.64,0.0 +gfx950,256,384,8192,2048,37,0,9.5408,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1350.51,1579.88,0.0 +gfx950,256,384,8192,3584,46,0,16.2207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1390.11,1335.31,0.0 +gfx950,256,384,8192,7168,37,0,19.9731,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2257.89,1853.89,0.0 +gfx950,256,384,8192,8192,37,0,21.9654,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2346.4,1885.64,0.0 +gfx950,256,384,8192,28672,37,0,69.2615,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2604.46,1865.93,0.0 +gfx950,256,384,9216,16384,42,0,44.5812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2601.19,1922.81,0.0 +gfx950,256,384,10240,8192,42,0,25.4148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2534.92,2021.67,0.0 +gfx950,256,384,12800,5120,38,0,23.8257,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2112.49,1829.18,0.0 +gfx950,256,384,13312,16384,49,0,57.4449,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2915.9,2131.11,0.0 +gfx950,256,384,14336,8192,38,0,32.5471,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2771.19,2190.77,0.0 +gfx950,256,384,16384,2048,49,0,16.6778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,1545.16,1784.01,0.0 +gfx950,256,384,16384,4096,38,0,20.4734,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2517.39,2291.94,0.0 +gfx950,256,384,16384,6656,38,0,29.552,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2834.05,2314.12,0.0 +gfx950,256,384,16384,8192,38,0,34.6005,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2979.13,2348.66,0.0 +gfx950,256,384,16384,13312,38,0,53.2326,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,3146.64,2332.98,0.0 +gfx950,256,384,16384,26624,49,0,95.913,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3492.83,2458.46,0.0 +gfx950,256,384,26624,16384,50,0,88.0544,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3804.55,2744.86,0.0 +gfx950,256,384,51200,5120,54,0,72.5351,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2775.57,2362.67,0.0 +gfx950,256,384,53248,16384,44,0,191.233,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3503.66,2511.32,0.0 +gfx950,256,384,57344,8192,10,0,105.5947,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,3416.62,2656.33,0.0 +gfx950,256,416,1280,8192,21,0,15.9344,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,547.5,502.8,0.0 +gfx950,256,512,512,4096,21,0,7.2567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,295.93,361.24,0.0 +gfx950,256,512,512,7168,0,0,23.2885,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,161.37,180.1,0.0 +gfx950,256,512,800,5120,21,0,38.7761,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,108.17,107.74,0.0 +gfx950,256,512,1024,3072,21,0,7.2108,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,446.72,472.61,0.0 +gfx950,256,512,1280,8192,21,0,16.2544,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,660.59,532.21,0.0 +gfx950,256,512,1536,7168,21,0,12.9046,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,873.66,690.68,0.0 +gfx950,256,512,1792,7424,21,0,13.7827,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,988.42,753.66,0.0 +gfx950,256,512,2048,6144,21,0,11.6892,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,1102.29,852.19,0.0 +gfx950,256,512,2304,16384,29,0,30.5848,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1263.85,831.39,0.0 +gfx950,256,512,2560,8192,29,0,17.993,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1193.51,845.01,0.0 +gfx950,256,512,3072,1536,0,0,8.7335,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,553.25,675.36,0.0 +gfx950,256,512,3072,6144,29,0,13.6979,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1410.97,1033.43,0.0 +gfx950,256,512,4096,4096,29,0,10.0912,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1702.46,1350.83,0.0 +gfx950,256,512,4096,8192,29,0,16.949,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2027.24,1361.06,0.0 +gfx950,256,512,4096,14336,29,0,28.388,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,2118.13,1311.27,0.0 +gfx950,256,512,4608,7168,4,0,29.4314,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1149.21,783.81,0.0 +gfx950,256,512,4608,16384,37,0,35.2309,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2194.36,1324.45,0.0 +gfx950,256,512,5120,1280,37,0,8.2567,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,812.78,1071.54,0.0 +gfx950,256,512,5120,5120,37,0,17.3284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1549.11,1134.6,0.0 +gfx950,256,512,5120,6400,37,0,18.7412,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1790.41,1241.4,0.0 +gfx950,256,512,5120,25600,37,0,58.227,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2305.08,1328.12,0.0 +gfx950,256,512,6144,3072,42,0,12.4658,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1550.43,1324.83,0.0 +gfx950,256,512,6144,4096,42,0,14.0105,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1839.32,1422.0,0.0 +gfx950,256,512,6144,12288,42,0,33.2974,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2321.79,1417.11,0.0 +gfx950,256,512,6144,16384,42,0,42.6302,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2417.99,1426.63,0.0 +gfx950,256,512,6400,5120,42,0,19.2804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1740.34,1257.67,0.0 +gfx950,256,512,7168,2048,1,0,13.0804,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1149.23,1162.38,0.0 +gfx950,256,512,7168,2304,8,0,12.9047,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1310.49,1254.38,0.0 +gfx950,256,512,7168,8192,42,0,24.4684,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2457.44,1585.61,0.0 +gfx950,256,512,7168,16384,42,0,43.582,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2759.38,1612.01,0.0 +gfx950,256,512,7168,18432,42,0,49.9132,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2710.53,1565.1,0.0 +gfx950,256,512,8192,1024,42,0,9.0616,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,947.95,1417.53,0.0 +gfx950,256,512,8192,2048,46,0,13.7717,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1247.48,1256.31,0.0 +gfx950,256,512,8192,3584,46,0,16.8791,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1781.18,1421.06,0.0 +gfx950,256,512,8192,4096,42,0,14.6073,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2352.23,1794.61,0.0 +gfx950,256,512,8192,7168,42,0,22.9346,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2621.78,1725.94,0.0 +gfx950,256,512,8192,8192,42,0,25.2603,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2720.45,1743.45,0.0 +gfx950,256,512,8192,28672,42,0,77.5444,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,3101.68,1717.33,0.0 +gfx950,256,512,9216,16384,49,0,53.7146,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2878.53,1659.31,0.0 +gfx950,256,512,10240,8192,38,0,30.4085,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2824.85,1793.12,0.0 +gfx950,256,512,12288,512,43,0,9.5274,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,676.2,1664.64,0.0 +gfx950,256,512,12288,1536,8,0,12.1891,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1585.63,1838.8,0.0 +gfx950,256,512,12288,4096,43,0,20.8203,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2475.45,1863.44,0.0 +gfx950,256,512,12288,6144,43,0,28.591,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2703.98,1815.41,0.0 +gfx950,256,512,12800,5120,47,0,26.9162,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2493.25,1753.07,0.0 +gfx950,256,512,13312,16384,43,0,65.6367,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3402.64,1933.03,0.0 +gfx950,256,512,14336,8192,43,0,38.0196,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3163.08,1985.75,0.0 +gfx950,256,512,16384,512,53,0,10.2901,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,834.78,2050.77,0.0 +gfx950,256,512,16384,2048,43,0,14.9724,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2294.87,2276.1,0.0 +gfx950,256,512,16384,4096,43,0,22.894,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3001.64,2244.27,0.0 +gfx950,256,512,16384,6656,43,0,33.6862,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3314.98,2167.27,0.0 +gfx950,256,512,16384,8192,43,0,38.1581,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3601.83,2253.34,0.0 +gfx950,256,512,16384,13312,43,0,58.5844,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3812.25,2206.0,0.0 +gfx950,256,512,16384,16384,43,0,68.3178,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4023.52,2271.58,0.0 +gfx950,256,512,16384,26624,43,0,109.2596,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4088.21,2212.13,0.0 +gfx950,256,512,16384,53248,43,0,204.9689,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4358.48,2276.52,0.0 +gfx950,256,512,18432,7168,50,0,41.6828,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3245.74,2081.67,0.0 +gfx950,256,512,20480,16384,50,0,86.5746,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3968.8,2228.57,0.0 +gfx950,256,512,24576,1536,10,0,18.1666,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2127.79,2445.88,0.0 +gfx950,256,512,26624,16384,54,0,104.3943,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4278.75,2390.56,0.0 +gfx950,256,512,28672,4096,54,0,36.5438,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3290.82,2438.96,0.0 +gfx950,256,512,32768,512,54,0,14.2158,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1208.51,2959.67,0.0 +gfx950,256,512,51200,5120,54,0,79.3531,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3382.8,2328.98,0.0 +gfx950,256,512,53248,16384,54,0,206.5008,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4326.15,2396.74,0.0 +gfx950,256,512,57344,8192,54,0,114.8863,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4187.06,2573.84,0.0 +gfx950,256,512,59136,8192,54,0,120.2706,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4124.61,2534.9,0.0 +gfx950,256,512,106496,16384,54,0,393.8495,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4536.52,2502.63,0.0 +gfx950,256,640,256,768,21,0,4.4726,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,56.27,150.19,0.0 +gfx950,256,640,1280,8192,21,0,16.1806,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,829.5,587.29,0.0 +gfx950,256,640,2304,16384,29,0,31.0486,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1556.22,871.74,0.0 +gfx950,256,640,2560,8192,29,0,18.3344,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1464.11,893.62,0.0 +gfx950,256,640,4608,16384,37,0,37.6937,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2563.74,1297.03,0.0 +gfx950,256,640,5120,1280,30,0,9.1065,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,921.17,1124.47,0.0 +gfx950,256,640,5120,5120,42,0,18.2902,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1834.56,1164.51,0.0 +gfx950,256,640,5120,6400,42,0,20.0998,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2086.74,1243.08,0.0 +gfx950,256,640,5120,25600,42,0,65.8701,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2547.02,1218.79,0.0 +gfx950,256,640,6400,5120,42,0,19.3128,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2171.77,1357.36,0.0 +gfx950,256,640,7168,8192,46,0,28.4912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2638.08,1444.54,0.0 +gfx950,256,640,7936,540672,46,0,1458.2397,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,3766.31,1596.83,0.0 +gfx950,256,640,8192,1024,13,0,9.4548,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,1135.66,1587.31,0.0 +gfx950,256,640,8192,2048,46,0,15.5769,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,1378.63,1253.76,0.0 +gfx950,256,640,8192,3584,38,0,18.5675,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2024.02,1417.14,0.0 +gfx950,256,640,8192,7168,46,0,26.2059,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2868.13,1608.02,0.0 +gfx950,256,640,8192,8192,46,0,29.5225,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2909.62,1580.54,0.0 +gfx950,256,640,8192,28672,46,0,88.2533,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,3406.65,1553.5,0.0 +gfx950,256,640,9216,16384,38,0,61.2521,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,3155.38,1510.75,0.0 +gfx950,256,640,10240,8192,51,0,34.6221,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3101.32,1665.75,0.0 +gfx950,256,640,12800,5120,43,0,29.0678,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2885.88,1747.31,0.0 +gfx950,256,640,13312,16384,47,0,74.153,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3764.82,1771.12,0.0 +gfx950,256,640,14336,8192,47,0,42.5891,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3529.63,1871.18,0.0 +gfx950,256,640,16384,2048,47,0,19.1148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2246.93,2009.13,0.0 +gfx950,256,640,16384,4096,47,0,26.5896,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3230.56,2099.94,0.0 +gfx950,256,640,16384,6656,47,0,38.105,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3663.21,2037.2,0.0 +gfx950,256,640,16384,8192,47,0,45.9881,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3735.72,1972.29,0.0 +gfx950,256,640,16384,13312,47,0,69.754,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4002.25,1925.1,0.0 +gfx950,256,640,16384,26624,47,0,119.6467,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4666.62,2069.38,0.0 +gfx950,256,640,26624,16384,47,0,143.1188,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3901.27,1798.68,0.0 +gfx950,256,640,51200,5120,45,0,101.0597,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3320.26,1961.68,0.0 +gfx950,256,640,53248,16384,48,0,279.1174,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4000.79,1825.78,0.0 +gfx950,256,640,57344,8192,43,0,162.3674,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3703.3,1914.81,0.0 +gfx950,256,768,1280,8192,21,0,16.8204,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,957.54,615.6,0.0 +gfx950,256,768,2304,16384,29,0,31.7907,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1823.87,902.93,0.0 +gfx950,256,768,2560,8192,29,0,18.2695,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1763.17,961.36,0.0 +gfx950,256,768,4608,16384,42,0,42.9379,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2700.74,1190.51,0.0 +gfx950,256,768,5120,1280,30,0,9.2191,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,1091.9,1261.8,0.0 +gfx950,256,768,5120,5120,42,0,18.7867,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2143.29,1220.95,0.0 +gfx950,256,768,5120,6400,42,0,20.7759,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2422.6,1285.43,0.0 +gfx950,256,768,5120,25600,42,0,67.9876,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2961.23,1224.2,0.0 +gfx950,256,768,6400,5120,46,0,21.2145,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2372.51,1328.36,0.0 +gfx950,256,768,7168,8192,49,0,31.7305,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2842.51,1371.42,0.0 +gfx950,256,768,8192,1024,13,0,11.9823,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8_2x2_intrawave_v3,1075.33,1432.98,0.0 +gfx950,256,768,8192,2048,49,0,16.0083,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,1609.78,1359.17,0.0 +gfx950,256,768,8192,3584,49,0,18.1697,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2482.0,1576.21,0.0 +gfx950,256,768,8192,7168,49,0,29.5435,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3052.93,1512.87,0.0 +gfx950,256,768,8192,8192,49,0,32.5122,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3170.48,1515.83,0.0 +gfx950,256,768,8192,28672,49,0,97.7048,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3692.52,1443.47,0.0 +gfx950,256,768,9216,16384,43,0,65.8922,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3519.81,1456.09,0.0 +gfx950,256,768,10240,8192,43,0,37.1829,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3465.28,1635.63,0.0 +gfx950,256,768,12800,5120,47,0,32.7883,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3070.1,1658.97,0.0 +gfx950,256,768,13312,16384,52,0,85.5076,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3917.87,1588.05,0.0 +gfx950,256,768,14336,8192,50,0,49.5292,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3642.07,1693.67,0.0 +gfx950,256,768,16384,2048,50,0,20.4289,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2522.88,2091.62,0.0 +gfx950,256,768,16384,4096,50,0,30.2755,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3404.71,1991.48,0.0 +gfx950,256,768,16384,6656,50,0,45.6941,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3665.76,1799.96,0.0 +gfx950,256,768,16384,8192,50,0,52.9874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3890.71,1800.81,0.0 +gfx950,256,768,16384,13312,52,0,77.5727,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4318.63,1796.12,0.0 +gfx950,256,768,16384,26624,50,0,138.9553,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4821.8,1824.28,0.0 +gfx950,256,768,26624,16384,50,0,165.2907,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4053.55,1604.99,0.0 +gfx950,256,768,51200,5120,54,0,111.9765,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3595.87,1890.41,0.0 +gfx950,256,768,53248,16384,54,0,289.4301,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4629.89,1811.45,0.0 +gfx950,256,768,57344,8192,54,0,167.4114,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4310.07,1947.94,0.0 +gfx950,256,768,547328,2048,54,0,526.8722,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3267.86,2660.88,0.0 +gfx950,256,832,1280,8192,29,0,18.4303,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,946.72,584.94,0.0 +gfx950,256,864,1280,8192,29,0,18.3432,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,987.8,599.33,0.0 +gfx950,256,896,13184,53504,44,0,276.8915,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4565.22,1445.67,0.0 +gfx950,256,1024,512,4096,21,0,9.3769,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,458.04,447.3,0.0 +gfx950,256,1024,512,7168,0,0,24.5597,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,306.04,266.84,0.0 +gfx950,256,1024,800,5120,23,0,46.2669,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,181.31,136.34,0.0 +gfx950,256,1024,1024,3072,21,0,9.0159,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,714.57,581.51,0.0 +gfx950,256,1024,1280,8192,29,0,18.2932,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1173.92,659.19,0.0 +gfx950,256,1024,1536,7168,29,0,15.7978,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1427.32,779.9,0.0 +gfx950,256,1024,2048,6144,29,0,14.261,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1807.01,955.86,0.0 +gfx950,256,1024,2304,16384,37,0,36.3864,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2124.68,878.94,0.0 +gfx950,256,1024,2560,8192,37,0,21.1445,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2031.25,942.23,0.0 +gfx950,256,1024,3072,1536,8,0,10.7099,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,902.31,881.16,0.0 +gfx950,256,1024,3072,6144,42,0,19.4604,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,1986.33,969.89,0.0 +gfx950,256,1024,4096,4096,42,0,14.6518,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2345.09,1288.19,0.0 +gfx950,256,1024,4096,8192,42,0,24.7357,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2778.15,1186.95,0.0 +gfx950,256,1024,4096,14336,42,0,41.0531,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2929.35,1098.3,0.0 +gfx950,256,1024,4608,7168,5,0,42.7675,a4w4_blockscale_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1581.71,692.64,0.0 +gfx950,256,1024,4608,16384,46,0,49.665,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,3113.24,1118.99,0.0 +gfx950,256,1024,5120,1280,38,0,10.8223,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1240.2,1332.24,0.0 +gfx950,256,1024,5120,5120,49,0,23.1664,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2317.46,1131.57,0.0 +gfx950,256,1024,5120,6400,49,0,25.3882,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2643.31,1187.42,0.0 +gfx950,256,1024,5120,25600,49,0,84.2335,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3186.8,1058.12,0.0 +gfx950,256,1024,6144,3072,51,0,17.8126,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2170.08,1324.51,0.0 +gfx950,256,1024,6144,4096,51,0,20.4317,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2522.53,1334.35,0.0 +gfx950,256,1024,6144,12288,51,0,49.6008,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3117.26,1141.58,0.0 +gfx950,256,1024,6144,16384,43,0,61.8487,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3333.27,1152.86,0.0 +gfx950,256,1024,6400,5120,51,0,26.509,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2531.55,1211.39,0.0 +gfx950,256,1024,7168,2048,8,0,19.98,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1504.74,1154.59,0.0 +gfx950,256,1024,7168,2304,9,0,22.1345,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1528.06,1089.58,0.0 +gfx950,256,1024,7168,8192,43,0,36.9755,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3252.4,1304.5,0.0 +gfx950,256,1024,7168,16384,43,0,67.0567,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3586.79,1219.7,0.0 +gfx950,256,1024,7168,18432,53,0,75.1228,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,3601.88,1200.4,0.0 +gfx950,256,1024,8192,1024,8,0,14.3455,a4w4_blockscale_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1197.58,1498.44,0.0 +gfx950,256,1024,8192,2048,43,0,18.0551,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,1903.05,1451.91,0.0 +gfx950,256,1024,8192,3584,43,0,21.7992,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2758.34,1527.23,0.0 +gfx950,256,1024,8192,4096,43,0,21.6498,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3174.14,1646.74,0.0 +gfx950,256,1024,8192,7168,43,0,35.1762,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3418.76,1415.94,0.0 +gfx950,256,1024,8192,8192,53,0,38.144,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,3603.16,1429.48,0.0 +gfx950,256,1024,8192,28672,43,0,111.089,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,4330.19,1340.35,0.0 +gfx950,256,1024,9216,16384,47,0,73.6628,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4198.02,1395.01,0.0 +gfx950,256,1024,10240,8192,50,0,49.6385,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3461.0,1351.95,0.0 +gfx950,256,1024,10880,28416,54,0,149.9933,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4221.34,1276.15,0.0004 +gfx950,256,1024,12288,512,44,0,12.3061,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,1047.03,2321.91,0.0 +gfx950,256,1024,12288,1536,44,0,17.9317,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,2155.66,1973.57,0.0 +gfx950,256,1024,12288,4096,44,0,31.5117,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3271.14,1663.79,0.0 +gfx950,256,1024,12288,6144,44,0,43.5271,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3552.24,1517.68,0.0 +gfx950,256,1024,12800,5120,48,0,40.8113,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3288.74,1509.48,0.0 +gfx950,256,1024,13312,16384,54,0,97.7036,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4571.75,1481.05,0.0 +gfx950,256,1024,14336,8192,54,0,60.3453,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3985.7,1529.11,0.0 +gfx950,256,1024,16384,512,45,0,14.463,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1187.85,2628.15,0.0 +gfx950,256,1024,16384,2048,54,0,21.8036,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3151.75,2356.5,0.0 +gfx950,256,1024,16384,4096,54,0,34.1505,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4024.51,2026.5,0.0 +gfx950,256,1024,16384,6656,54,0,52.7494,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4233.95,1734.39,0.0 +gfx950,256,1024,16384,8192,54,0,59.9643,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4584.03,1748.67,0.0 +gfx950,256,1024,16384,13312,54,0,91.4984,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4881.8,1633.06,0.0 +gfx950,256,1024,16384,16384,54,0,111.7138,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4921.11,1576.89,0.0 +gfx950,256,1024,16384,26624,54,0,163.6572,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5458.69,1621.01,0.0 +gfx950,256,1024,16384,53248,54,0,330.6898,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5402.97,1502.99,0.0 +gfx950,256,1024,18432,7168,47,0,75.4356,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3586.94,1424.78,0.0 +gfx950,256,1024,20480,16384,50,0,165.8132,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4144.39,1315.36,0.0 +gfx950,256,1024,24576,1536,54,0,34.8437,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2218.75,2008.75,0.0 +gfx950,256,1024,26624,16384,54,0,189.7626,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4707.74,1480.89,0.0 +gfx950,256,1024,28672,4096,54,0,67.7295,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3551.16,1764.93,0.0 +gfx950,256,1024,32768,512,54,0,25.3249,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1356.76,2991.51,0.0 +gfx950,256,1024,51200,5120,48,0,143.817,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3733.01,1658.71,0.0 +gfx950,256,1024,53248,16384,54,0,375.89,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4753.27,1472.9,0.0 +gfx950,256,1024,57344,8192,54,0,212.8558,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4519.83,1674.92,0.0 +gfx950,256,1024,59136,8192,54,0,219.812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4513.57,1672.0,0.0 +gfx950,256,1024,106496,16384,54,0,695.1664,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5140.37,1580.78,0.0 +gfx950,256,1152,1280,8192,29,0,18.0,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1342.18,717.26,0.0 +gfx950,256,1152,2304,16384,37,0,38.6497,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2250.29,869.86,0.0 +gfx950,256,1152,2560,8192,37,0,21.6324,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,2233.61,975.51,0.0 +gfx950,256,1152,4608,16384,49,0,54.2861,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3204.25,1064.78,0.0 +gfx950,256,1152,5120,1280,38,0,13.5937,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1110.77,1163.08,0.0 +gfx950,256,1152,5120,5120,38,0,23.5739,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2562.07,1181.51,0.0 +gfx950,256,1152,5120,6400,49,0,26.6498,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2832.95,1195.76,0.0 +gfx950,256,1152,5120,25600,49,0,85.8163,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,3519.03,1072.97,0.0 +gfx950,256,1152,6400,5120,53,0,29.0565,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,2598.3,1172.84,0.0 +gfx950,256,1152,7168,8192,43,0,39.4724,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3427.5,1281.75,0.0 +gfx950,256,1152,8192,1024,47,0,14.4799,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,1334.77,1633.89,0.0 +gfx950,256,1152,8192,2048,47,0,17.1527,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2253.56,1658.2,0.0 +gfx950,256,1152,8192,3584,47,0,23.2214,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2913.08,1533.88,0.0 +gfx950,256,1152,8192,7168,47,0,38.4727,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3516.56,1361.05,0.0 +gfx950,256,1152,8192,8192,47,0,41.1237,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3759.85,1389.65,0.0 +gfx950,256,1152,8192,28672,47,0,121.8771,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4440.26,1253.97,0.0 +gfx950,256,1152,9216,16384,50,0,85.6876,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4060.01,1239.02,0.0 +gfx950,256,1152,10240,8192,50,0,50.9078,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3796.54,1380.04,0.0 +gfx950,256,1152,12800,5120,54,0,43.4897,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3471.97,1499.4,0.0 +gfx950,256,1152,13312,16384,45,0,119.9853,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4188.11,1243.15,0.0 +gfx950,256,1152,14336,8192,45,0,66.8811,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4045.73,1442.4,0.0 +gfx950,256,1152,16384,2048,47,0,30.2016,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2559.78,1844.46,0.0 +gfx950,256,1152,16384,4096,47,0,46.9601,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3292.56,1568.62,0.0 +gfx950,256,1152,16384,6656,47,0,69.2029,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3630.71,1388.79,0.0 +gfx950,256,1152,16384,8192,47,0,81.3708,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3800.35,1346.63,0.0 +gfx950,256,1152,16384,13312,47,0,123.7448,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4060.87,1248.28,0.0 +gfx950,256,1152,16384,26624,47,0,232.4817,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4323.02,1166.49,0.0 +gfx950,256,1152,26624,16384,45,0,226.9321,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4428.74,1272.99,0.0 +gfx950,256,1152,51200,5120,54,0,155.4048,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3886.49,1621.48,0.0 +gfx950,256,1152,53248,16384,54,0,451.0597,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4456.27,1259.98,0.0 +gfx950,256,1152,57344,8192,54,0,254.8822,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4246.4,1458.4,0.0 +gfx950,256,1536,512,7168,0,0,25.5821,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,440.71,348.4,0.0 +gfx950,256,1536,1280,8192,29,0,17.9019,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1799.38,863.96,0.0 +gfx950,256,1536,1536,7168,4,0,29.9466,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1129.44,525.22,0.0 +gfx950,256,1536,2304,16384,42,0,45.0414,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2574.61,855.55,0.0 +gfx950,256,1536,2560,8192,42,0,25.354,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2541.0,971.9,0.0 +gfx950,256,1536,3072,1536,4,0,12.9268,a4w4_blockscale_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1121.35,1003.82,0.0 +gfx950,256,1536,3200,10496,46,0,37.9321,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2720.12,914.4,0.0 +gfx950,256,1536,3456,517376,38,0,1592.2596,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,3449.75,817.7,0.0 +gfx950,256,1536,4608,7168,6,0,54.4245,a4w4_blockscale_256x96x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1864.39,664.7,0.0 +gfx950,256,1536,4608,16384,51,0,65.3407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3549.52,986.94,0.0 +gfx950,256,1536,5120,1280,43,0,15.6453,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,1286.82,1277.6,0.0 +gfx950,256,1536,5120,5120,43,0,28.2912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2846.49,1158.24,0.0 +gfx950,256,1536,5120,6400,43,0,31.8861,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3156.96,1161.25,0.0 +gfx950,256,1536,5120,25600,43,0,100.8438,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3992.84,1000.81,0.0 +gfx950,256,1536,6400,5120,47,0,33.9068,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2968.82,1179.02,0.0 +gfx950,256,1536,7168,2048,10,0,26.9209,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1675.17,1149.03,0.0 +gfx950,256,1536,7168,2304,7,0,29.4102,a4w4_blockscale_256x96x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1725.06,1089.66,0.0 +gfx950,256,1536,7168,8192,50,0,48.3249,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3732.83,1193.42,0.0 +gfx950,256,1536,8192,1024,45,0,17.5535,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1468.07,1717.41,0.0 +gfx950,256,1536,8192,2048,50,0,20.3498,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2532.68,1726.17,0.0 +gfx950,256,1536,8192,3584,54,0,28.3942,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3176.5,1500.25,0.0 +gfx950,256,1536,8192,7168,50,0,46.6282,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3868.66,1287.44,0.0 +gfx950,256,1536,8192,8192,52,0,53.7063,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3838.63,1210.5,0.0 +gfx950,256,1536,8192,28672,50,0,142.6752,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,5057.32,1153.85,0.0 +gfx950,256,1536,9216,16384,54,0,101.515,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4569.34,1146.55,0.0 +gfx950,256,1536,10240,8192,54,0,59.7295,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4314.42,1334.21,0.0 +gfx950,256,1536,12800,5120,47,0,61.135,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3293.15,1243.51,0.0 +gfx950,256,1536,13312,16384,50,0,157.0373,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4266.6,1034.97,0.0 +gfx950,256,1536,14336,8192,54,0,93.3554,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3864.56,1168.14,0.0 +gfx950,256,1536,16384,2048,54,0,36.3137,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2838.58,1891.34,0.0 +gfx950,256,1536,16384,4096,54,0,58.7302,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3510.26,1481.89,0.0 +gfx950,256,1536,16384,6656,50,0,83.9826,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3989.01,1309.43,0.0 +gfx950,256,1536,16384,8192,50,0,99.0521,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4162.63,1249.16,0.0 +gfx950,256,1536,16384,13312,50,0,143.3822,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4672.93,1182.9,0.0 +gfx950,256,1536,16384,26624,50,0,259.8596,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,5156.75,1111.69,0.0 +gfx950,256,1536,26624,16384,54,0,280.4802,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4777.63,1114.07,0.0 +gfx950,256,1536,51200,5120,54,0,185.6814,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4337.03,1574.15,0.0 +gfx950,256,1536,53248,16384,54,0,505.4454,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5302.37,1211.54,0.0 +gfx950,256,1536,57344,8192,54,0,323.7218,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4457.87,1289.17,0.0 +gfx950,256,1552,57344,8192,21,0,402.0203,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3627.03,1042.82,0.0 +gfx950,256,1600,1280,8192,29,0,18.8692,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1778.26,842.24,0.0 +gfx950,256,1600,2304,16384,42,0,45.0896,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2679.02,872.8,0.0 +gfx950,256,1600,2560,8192,46,0,27.4374,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2445.89,919.6,0.0 +gfx950,256,1600,4608,16384,43,0,67.3316,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3588.09,974.31,0.0 +gfx950,256,1600,5120,1280,39,0,15.9852,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x384E,1311.93,1294.0,0.0 +gfx950,256,1600,5120,5120,47,0,30.2039,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2777.33,1112.02,0.0 +gfx950,256,1600,5120,6400,47,0,34.0818,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3076.65,1111.68,0.0 +gfx950,256,1600,5120,25600,47,0,106.7869,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3927.73,958.92,0.0 +gfx950,256,1600,6400,5120,47,0,35.922,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2919.04,1140.25,0.0 +gfx950,256,1600,7168,8192,50,0,52.2949,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3593.18,1125.37,0.0 +gfx950,256,1600,8192,1024,45,0,17.4726,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1536.32,1787.25,0.0 +gfx950,256,1600,8192,2048,54,0,20.5068,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2618.01,1767.29,0.0 +gfx950,256,1600,8192,3584,54,0,29.2052,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3216.98,1498.42,0.0 +gfx950,256,1600,8192,7168,52,0,51.3594,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3658.63,1193.72,0.0 +gfx950,256,1600,8192,8192,52,0,57.7062,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3721.41,1149.31,0.0 +gfx950,256,1600,8192,28672,54,0,161.0625,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4666.63,1034.33,0.0 +gfx950,256,1600,9216,16384,48,0,105.4034,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4584.14,1120.42,0.0 +gfx950,256,1600,10240,8192,47,0,78.9556,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3399.83,1029.24,0.0 +gfx950,256,1600,12800,5120,47,0,63.5804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3298.43,1224.03,0.0 +gfx950,256,1600,13312,16384,50,0,168.8251,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4134.05,975.91,0.0 +gfx950,256,1600,14336,8192,50,0,98.1944,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3827.2,1131.93,0.0 +gfx950,256,1600,16384,2048,54,0,37.0677,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2896.7,1911.22,0.0 +gfx950,256,1600,16384,4096,54,0,62.2458,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3450.01,1433.99,0.0 +gfx950,256,1600,16384,6656,54,0,91.2148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3825.76,1230.94,0.0 +gfx950,256,1600,16384,8192,54,0,102.7525,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4179.92,1227.14,0.0 +gfx950,256,1600,16384,13312,54,0,155.5977,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4485.49,1106.25,0.0 +gfx950,256,1600,16384,26624,54,0,292.3706,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4774.3,998.16,0.0 +gfx950,256,1600,26624,16384,48,0,290.0493,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4812.51,1090.88,0.0 +gfx950,256,1600,51200,5120,54,0,209.092,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4011.92,1430.03,0.0 +gfx950,256,1600,53248,16384,54,0,576.4818,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4842.7,1074.98,0.0 +gfx950,256,1600,57344,8192,48,0,336.8595,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4462.51,1261.46,0.0 +gfx950,256,1664,57344,8192,21,0,410.6409,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3807.14,1053.32,0.0 +gfx950,256,1792,5376,4096,47,0,27.5359,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2866.08,1232.85,0.0 +gfx950,256,1792,57344,8192,26,0,432.9381,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3888.84,1034.19,0.0 +gfx950,256,2048,512,4096,21,0,10.1832,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,843.54,720.8,0.0 +gfx950,256,2048,512,7168,0,0,26.4625,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,568.06,425.97,0.0 +gfx950,256,2048,1024,3072,29,0,10.2991,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1251.07,865.41,0.0 +gfx950,256,2048,1280,8192,37,0,21.633,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1985.38,872.48,0.0 +gfx950,256,2048,1536,7168,42,0,22.528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2001.83,849.45,0.0 +gfx950,256,2048,2048,6144,42,0,20.8673,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2469.87,1004.99,0.0 +gfx950,256,2048,2304,16384,46,0,51.7314,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128E,2988.88,871.59,0.0 +gfx950,256,2048,2560,8192,38,0,31.454,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2730.95,933.43,0.0 +gfx950,256,2048,3072,1536,9,0,16.228,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1190.99,1017.69,0.0 +gfx950,256,2048,3072,6144,51,0,28.7334,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2690.58,985.32,0.0 +gfx950,256,2048,4096,4096,43,0,22.0307,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3119.26,1332.69,0.0 +gfx950,256,2048,4096,8192,43,0,38.7963,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3542.58,1081.11,0.0 +gfx950,256,2048,4096,14336,53,0,64.4578,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,3731.41,943.52,0.0 +gfx950,256,2048,4608,7168,10,0,66.6546,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2029.74,641.06,0.0 +gfx950,256,2048,4608,16384,47,0,77.7096,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3979.4,944.55,0.0 +gfx950,256,2048,5120,1280,10,0,17.0408,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1575.25,1499.87,0.0 +gfx950,256,2048,5120,5120,50,0,34.7884,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3086.49,1130.31,0.0 +gfx950,256,2048,5120,6400,50,0,39.2433,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3420.14,1118.89,0.0 +gfx950,256,2048,5120,25600,50,0,127.207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4220.45,886.13,0.0 +gfx950,256,2048,6144,3072,54,0,26.6042,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2905.91,1418.9,0.0 +gfx950,256,2048,6144,4096,44,0,31.413,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3281.42,1335.21,0.0 +gfx950,256,2048,6144,4608,44,0,34.4644,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3364.75,1277.85,0.0 +gfx950,256,2048,6144,12288,44,0,76.3642,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4049.51,988.65,0.0 +gfx950,256,2048,6144,16384,44,0,95.5177,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4316.65,966.05,0.0 +gfx950,256,2048,6400,5120,54,0,40.6593,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3301.03,1176.64,0.0 +gfx950,256,2048,7168,2048,9,0,40.0639,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1500.84,968.39,0.0 +gfx950,256,2048,7168,2304,9,0,42.4075,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1595.14,942.69,0.0 +gfx950,256,2048,7168,8192,54,0,57.2065,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4204.39,1173.1,0.0 +gfx950,256,2048,7168,16384,54,0,105.1191,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4576.11,997.51,0.0 +gfx950,256,2048,7168,18432,54,0,118.8045,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4555.1,962.04,0.0 +gfx950,256,2048,8192,1024,54,0,18.164,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1891.64,2135.95,0.0 +gfx950,256,2048,8192,2048,54,0,22.5974,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3041.03,1948.91,0.0 +gfx950,256,2048,8192,3584,54,0,32.4536,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3705.57,1599.35,0.0 +gfx950,256,2048,8192,4096,54,0,37.1442,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3700.15,1467.95,0.0 +gfx950,256,2048,8192,7168,54,0,55.2862,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4350.42,1270.74,0.0 +gfx950,256,2048,8192,8192,54,0,60.7364,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4525.75,1243.04,0.0 +gfx950,256,2048,8192,28672,54,0,181.0399,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5314.15,996.22,0.0 +gfx950,256,2048,9216,16384,47,0,144.1381,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4290.85,902.08,0.0 +gfx950,256,2048,10240,8192,50,0,93.3564,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3680.49,988.41,0.0 +gfx950,256,2048,12288,512,34,0,22.2856,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x768E,1156.34,2423.16,0.0 +gfx950,256,2048,12288,1536,54,0,35.0292,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2207.0,1751.16,0.0 +gfx950,256,2048,12288,4096,44,0,60.5778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3403.2,1315.53,0.0 +gfx950,256,2048,12288,6144,44,0,83.0992,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3721.31,1135.65,0.0 +gfx950,256,2048,12800,5120,54,0,77.0353,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3484.58,1174.0,0.0 +gfx950,256,2048,13312,16384,54,0,181.8868,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4911.59,991.58,0.0 +gfx950,256,2048,14336,8192,54,0,111.0807,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4330.51,1132.77,0.0 +gfx950,256,2048,16384,512,54,0,23.7227,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1448.39,3027.79,0.0 +gfx950,256,2048,16384,2048,54,0,42.9001,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3203.7,2004.27,0.0 +gfx950,256,2048,16384,4096,54,0,65.2327,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4213.81,1607.44,0.0 +gfx950,256,2048,16384,6656,54,0,94.9655,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4703.57,1352.6,0.0 +gfx950,256,2048,16384,8192,54,0,118.9261,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4622.67,1199.12,0.0 +gfx950,256,2048,16384,13312,54,0,169.6998,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5264.31,1118.4,0.0 +gfx950,256,2048,16384,16384,54,0,216.0855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5088.32,1009.34,0.0 +gfx950,256,2048,16384,26624,54,0,319.7683,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5587.5,977.19,0.0 +gfx950,256,2048,16384,53248,54,0,619.7511,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5765.88,900.11,0.0 +gfx950,256,2048,18432,7168,48,0,134.1723,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4033.37,1109.75,0.0 +gfx950,256,2048,20480,16384,54,0,281.7409,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4878.2,952.77,0.0 +gfx950,256,2048,24576,1536,54,0,56.4026,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2741.34,2147.25,0.0 +gfx950,256,2048,26624,16384,54,0,372.6865,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4794.13,922.85,0.0 +gfx950,256,2048,28672,4096,54,0,123.3527,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3899.68,1462.11,0.0 +gfx950,256,2048,32768,512,36,0,48.8553,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1406.59,2929.68,0.0 +gfx950,256,2048,51200,5120,54,0,256.484,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4186.39,1349.13,0.0 +gfx950,256,2048,53248,16384,54,0,682.467,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5236.02,983.33,0.0 +gfx950,256,2048,57344,8192,54,0,393.2207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4893.3,1215.99,0.0 +gfx950,256,2048,59136,8192,54,0,423.1209,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4689.62,1164.75,0.0 +gfx950,256,2048,106496,16384,54,0,1308.8308,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5460.47,1012.66,0.0 +gfx950,256,2432,3584,738560,47,0,2828.8778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4551.28,791.49,0.0 +gfx950,256,2880,1280,8192,42,0,25.7621,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2344.45,947.6,0.0 +gfx950,256,2880,2304,16384,51,0,63.6883,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,3414.01,875.17,0.0 +gfx950,256,2880,2560,8192,43,0,36.4457,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3314.41,1015.97,0.0 +gfx950,256,2880,4608,16384,52,0,96.5076,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4506.02,910.64,0.0 +gfx950,256,2880,5120,1280,54,0,17.8654,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2112.95,1937.33,0.0 +gfx950,256,2880,5120,5120,48,0,42.4121,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3560.19,1178.23,0.0 +gfx950,256,2880,5120,6400,54,0,48.4765,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3893.51,1136.45,0.0 +gfx950,256,2880,5120,25600,54,0,153.855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4907.05,857.24,0.0 +gfx950,256,2880,6400,5120,54,0,65.797,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2868.58,921.33,0.0 +gfx950,256,2880,7168,8192,47,0,87.9683,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3844.89,937.2,0.0 +gfx950,256,2880,8192,1024,40,0,27.0418,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x512E,1786.8,1954.56,0.0 +gfx950,256,2880,8192,2048,50,0,38.258,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2525.92,1529.71,0.0 +gfx950,256,2880,8192,3584,50,0,56.3354,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3001.92,1189.78,0.0 +gfx950,256,2880,8192,7168,50,0,87.1289,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3881.93,997.01,0.0 +gfx950,256,2880,8192,8192,50,0,96.8561,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3990.94,955.41,0.0 +gfx950,256,2880,8192,28672,50,0,274.6138,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4926.61,749.83,0.0 +gfx950,256,2880,9216,16384,52,0,185.5243,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4687.96,820.24,0.0 +gfx950,256,2880,10240,8192,54,0,117.3484,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4117.52,960.57,0.0 +gfx950,256,2880,12800,5120,54,0,106.5865,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3541.61,1068.32,0.0 +gfx950,256,2880,13312,16384,52,0,269.9373,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4653.96,775.45,0.0 +gfx950,256,2880,14336,8192,54,0,155.6029,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4347.33,983.86,0.0 +gfx950,256,2880,16384,2048,54,0,61.497,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3142.81,1855.35,0.0 +gfx950,256,2880,16384,4096,54,0,95.895,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4030.94,1395.53,0.0 +gfx950,256,2880,16384,6656,54,0,138.9253,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4521.42,1140.77,0.0 +gfx950,256,2880,16384,8192,54,0,165.6418,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4667.26,1046.1,0.0 +gfx950,256,2880,16384,13312,54,0,247.3824,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5078.28,899.79,0.0 +gfx950,256,2880,16384,26624,54,0,473.6679,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5304.47,740.63,0.0 +gfx950,256,2880,26624,16384,48,0,523.2043,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4802.25,755.06,0.0 +gfx950,256,2880,51200,5120,54,0,344.4229,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4384.0,1258.21,0.0 +gfx950,256,2880,53248,16384,54,0,961.9976,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5223.62,796.79,0.0 +gfx950,256,2880,57344,8192,54,0,567.131,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4771.08,1017.36,0.0 +gfx950,256,2944,1792,8192,38,0,32.608,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2650.77,918.48,0.0 +gfx950,256,3000,512,7168,29,0,16.2854,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1352.14,961.54,0.0 +gfx950,256,3000,2112,7168,51,0,33.1164,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2742.84,935.89,0.0 +gfx950,256,3000,3072,1536,47,0,15.3407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,1845.52,1505.49,0.0 +gfx950,256,3000,7168,256,40,0,17.4695,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x512E,630.24,2536.39,0.0 +gfx950,256,3000,7168,2048,50,0,37.8602,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,2326.46,1410.98,0.0 +gfx950,256,3072,57344,8192,21,0,735.2449,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3925.52,815.76,0.0 +gfx950,256,3200,1280,8192,42,0,26.6539,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2517.79,995.8,0.0 +gfx950,256,3200,2304,16384,43,0,69.1244,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3495.03,865.6,0.0 +gfx950,256,3200,2560,8192,43,0,39.9461,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3359.97,1000.77,0.0 +gfx950,256,3200,4608,16384,54,0,106.0754,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4555.1,881.02,0.0 +gfx950,256,3200,5120,1280,45,0,19.7857,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2119.87,1925.27,0.0 +gfx950,256,3200,5120,5120,45,0,47.9925,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3495.8,1126.58,0.0 +gfx950,256,3200,5120,6400,45,0,54.7726,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3828.83,1084.34,0.0 +gfx950,256,3200,5120,25600,45,0,176.123,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4762.93,790.72,0.0 +gfx950,256,3200,6400,5120,47,0,69.1363,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3033.36,947.92,0.0 +gfx950,256,3200,7168,8192,50,0,97.6826,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3847.25,904.38,0.0 +gfx950,256,3200,8192,1024,54,0,28.482,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1884.95,2045.56,0.0 +gfx950,256,3200,8192,2048,54,0,40.9605,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2621.41,1564.78,0.0 +gfx950,256,3200,8192,3584,54,0,57.3302,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3277.59,1270.59,0.0 +gfx950,256,3200,8192,7168,54,0,94.5081,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3976.48,986.77,0.0 +gfx950,256,3200,8192,8192,54,0,104.4674,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4111.3,948.53,0.0 +gfx950,256,3200,8192,28672,52,0,305.1042,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4926.97,707.12,0.0 +gfx950,256,3200,9216,16384,54,0,197.1671,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4901.26,815.02,0.0 +gfx950,256,3200,10240,8192,45,0,122.2919,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4390.08,986.05,0.0 +gfx950,256,3200,12800,5120,54,0,109.5218,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3829.65,1121.97,0.0 +gfx950,256,3200,13312,16384,54,0,287.2469,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4859.46,767.5,0.0 +gfx950,256,3200,14336,8192,54,0,162.3098,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4630.77,1007.81,0.0 +gfx950,256,3200,16384,2048,54,0,71.5292,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3002.25,1746.3,0.0 +gfx950,256,3200,16384,4096,54,0,111.6874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3845.53,1297.96,0.0 +gfx950,256,3200,16384,6656,54,0,163.4773,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4269.29,1040.1,0.0 +gfx950,256,3200,16384,8192,47,0,196.8084,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4364.62,940.37,0.0 +gfx950,256,3200,16384,13312,47,0,292.815,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4767.05,803.27,0.0 +gfx950,256,3200,16384,26624,47,0,576.3884,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4843.49,634.22,0.0 +gfx950,256,3200,26624,16384,54,0,556.8581,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5013.36,744.74,0.0 +gfx950,256,3200,51200,5120,54,0,386.9368,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4335.91,1206.77,0.0 +gfx950,256,3200,53248,16384,48,0,1128.8789,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4946.02,711.51,0.0 +gfx950,256,3200,57344,8192,54,0,649.529,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4628.7,946.82,0.0 +gfx950,256,3712,951552,4352,54,0,6920.3238,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4442.56,1321.18,0.0 +gfx950,256,4096,512,4096,29,0,10.9658,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1566.68,1243.09,0.0 +gfx950,256,4096,512,7168,0,0,27.3037,a4w4_blockscale_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1101.12,758.48,0.0 +gfx950,256,4096,800,5120,21,0,39.6258,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,846.78,481.69,0.0 +gfx950,256,4096,1024,3072,30,0,14.0452,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256E,1834.78,1157.19,0.0 +gfx950,256,4096,1280,8192,38,0,32.2517,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,2663.41,1007.88,0.0 +gfx950,256,4096,1536,7168,51,0,33.603,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128E,2684.11,975.15,0.0 +gfx950,256,4096,2048,6144,43,0,33.101,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,3114.08,1077.05,0.0 +gfx950,256,4096,2304,16384,47,0,78.0812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3960.46,913.19,0.0 +gfx950,256,4096,2560,8192,50,0,49.3258,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3482.94,977.88,0.0 +gfx950,256,4096,3072,1536,10,0,28.6765,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1347.96,1069.55,0.0 +gfx950,256,4096,3072,6144,44,0,44.4912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3475.27,1060.57,0.0 +gfx950,256,4096,4096,4096,54,0,37.9223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3624.23,1327.23,0.0 +gfx950,256,4096,4096,8192,54,0,63.8634,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4304.15,1050.82,0.0 +gfx950,256,4096,4096,14336,54,0,104.59,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4599.26,882.25,0.0 +gfx950,256,4096,4608,7168,10,0,131.1566,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2063.05,525.66,0.0 +gfx950,256,4096,4608,16384,47,0,144.7034,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4274.09,753.62,0.0 +gfx950,256,4096,5120,1280,33,0,30.2367,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x640E,1775.56,1582.23,0.0 +gfx950,256,4096,5120,5120,50,0,67.9041,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3162.52,965.13,0.0 +gfx950,256,4096,5120,6400,50,0,77.6951,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3454.99,919.42,0.0 +gfx950,256,4096,5120,25600,50,0,238.1354,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,4508.96,671.5,0.0 +gfx950,256,4096,6144,3072,44,0,52.4445,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,2948.24,1259.62,0.0 +gfx950,256,4096,6144,4096,44,0,61.9246,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3329.18,1151.45,0.0 +gfx950,256,4096,6144,12288,44,0,144.5997,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4277.15,783.17,0.0 +gfx950,256,4096,6144,16384,44,0,177.7384,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4639.59,755.14,0.0 +gfx950,256,4096,6400,5120,54,0,81.4407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3296.08,973.7,0.0 +gfx950,256,4096,7168,2048,10,0,77.5343,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1551.04,906.11,0.0 +gfx950,256,4096,7168,2304,5,0,80.8616,a4w4_blockscale_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1673.12,886.66,0.0 +gfx950,256,4096,7168,8192,54,0,110.3777,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4358.09,949.99,0.0 +gfx950,256,4096,7168,16384,54,0,197.1971,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4878.74,765.71,0.0 +gfx950,256,4096,7168,18432,54,0,218.7404,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4948.02,743.02,0.0 +gfx950,256,4096,8192,1024,54,0,30.4329,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2258.07,2411.87,0.0 +gfx950,256,4096,8192,2048,54,0,43.3485,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3170.56,1838.4,0.0 +gfx950,256,4096,8192,3584,54,0,62.8575,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3826.4,1417.95,0.0 +gfx950,256,4096,8192,4096,54,0,68.705,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4000.84,1343.06,0.0 +gfx950,256,4096,8192,7168,54,0,104.1747,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4617.59,1066.95,0.0 +gfx950,256,4096,8192,8192,54,0,116.6368,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4713.4,1006.89,0.0 +gfx950,256,4096,8192,28672,54,0,345.2111,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5573.82,704.7,0.0 +gfx950,256,4096,9216,16384,44,0,253.5015,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4879.46,728.0,0.0 +gfx950,256,4096,10240,8192,54,0,156.2228,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4398.81,912.84,0.0 +gfx950,256,4096,12288,512,54,0,34.8007,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1480.99,3013.09,0.0 +gfx950,256,4096,12288,1536,54,0,57.2009,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2703.08,1979.8,0.0 +gfx950,256,4096,12288,4096,54,0,101.7739,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4051.3,1318.78,0.0 +gfx950,256,4096,12288,6144,54,0,138.7122,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4458.69,1088.55,0.0 +gfx950,256,4096,12800,5120,54,0,139.4543,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3849.8,1062.08,0.0 +gfx950,256,4096,13312,16384,54,0,367.441,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4862.57,684.89,0.0 +gfx950,256,4096,14336,8192,54,0,207.3872,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4639.02,930.33,0.0 +gfx950,256,4096,16384,512,54,0,45.8036,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1500.31,3044.75,0.0 +gfx950,256,4096,16384,2048,54,0,82.564,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3329.27,1879.62,0.0 +gfx950,256,4096,16384,4096,45,0,129.4044,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4248.35,1361.32,0.0 +gfx950,256,4096,16384,6656,54,0,186.3419,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4794.16,1086.04,0.0 +gfx950,256,4096,16384,8192,54,0,215.1571,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5110.27,1013.7,0.0 +gfx950,256,4096,16384,13312,54,0,335.7525,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5321.5,805.75,0.0 +gfx950,256,4096,16384,16384,54,0,394.8506,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5569.25,764.82,0.0 +gfx950,256,4096,16384,26624,54,0,626.5094,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5703.69,649.39,0.0 +gfx950,256,4096,16384,53248,54,0,1240.1583,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5762.83,547.9,0.0 +gfx950,256,4096,18432,7168,54,0,235.4389,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4597.08,984.27,0.0 +gfx950,256,4096,20480,16384,54,0,506.1778,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5430.46,729.19,0.0 +gfx950,256,4096,24576,1536,45,0,106.1324,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2913.7,2104.42,0.0 +gfx950,256,4096,26624,16384,54,0,656.538,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5442.81,715.51,0.0 +gfx950,256,4096,28672,4096,54,0,220.5013,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4363.12,1369.56,0.0 +gfx950,256,4096,32768,512,36,0,90.4435,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1519.61,3072.33,0.0 +gfx950,256,4096,51200,5120,54,0,478.3978,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4488.91,1172.64,0.0 +gfx950,256,4096,53248,16384,54,0,1300.5576,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5495.2,696.6,0.0 +gfx950,256,4096,57344,8192,54,0,744.0942,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5171.78,969.53,0.0 +gfx950,256,4096,59136,8192,54,0,791.8357,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5011.83,938.88,0.0 +gfx950,256,4096,106496,16384,54,0,2568.3523,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5565.3,692.42,0.0 +gfx950,256,4224,4096,768,38,0,19.9861,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256E,1329.68,1891.21,0.0 +gfx950,256,4736,44416,17920,54,0,1389.8884,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5424.24,619.55,0.0029 +gfx950,256,4992,4864,7168,50,0,96.904,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3592.15,865.66,0.0 +gfx950,256,5120,14336,8192,54,0,256.5179,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4688.14,882.95,0.0 +gfx950,256,5504,6144,5376,50,0,102.785,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3537.44,962.62,0.0 +gfx950,256,6016,4864,7168,54,0,106.812,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3927.44,912.98,0.0 +gfx950,256,6144,14336,8192,54,0,308.834,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4672.77,842.03,0.0 +gfx950,256,6272,2688,68096,47,0,535.6359,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4286.64,632.5,0.0 +gfx950,256,6528,7040,5632,54,0,128.2402,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4036.65,1014.67,0.0088 +gfx950,256,6656,2560,3328,47,0,44.3137,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,2559.34,1115.1,0.0 +gfx950,256,7040,896,1280,43,0,11.9928,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,1346.48,1475.45,0.0 +gfx950,256,7168,6272,5888,54,0,139.2251,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3802.63,930.02,0.0019 +gfx950,256,7680,14336,8192,54,0,371.2804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4858.56,835.97,0.0 +gfx950,256,7808,5120,7168,54,0,141.1399,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4060.58,894.77,0.0 +gfx950,256,7808,7040,5376,54,0,150.5516,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3925.69,995.33,0.0112 +gfx950,256,8192,512,4096,42,0,16.4152,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2093.17,1596.96,0.0 +gfx950,256,8192,512,7168,1,0,37.1971,a4w4_blockscale_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1616.51,1064.16,0.0 +gfx950,256,8192,800,5120,21,0,41.7607,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1606.99,865.09,0.0 +gfx950,256,8192,1024,3072,43,0,20.267,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2543.03,1526.27,0.0 +gfx950,256,8192,1280,8192,50,0,51.423,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3340.89,1162.3,0.0 +gfx950,256,8192,1536,7168,44,0,48.906,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3688.48,1227.48,0.0 +gfx950,256,8192,2048,6144,54,0,55.3677,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3723.44,1174.18,0.0 +gfx950,256,8192,2304,16384,47,0,151.5138,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,4081.97,816.64,0.0 +gfx950,256,8192,2560,8192,50,0,96.5366,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3559.24,890.68,0.0 +gfx950,256,8192,3072,1536,10,0,50.2556,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1538.32,1173.65,0.0 +gfx950,256,8192,3072,6144,44,0,86.0465,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3593.84,987.08,0.0 +gfx950,256,8192,4096,4096,54,0,70.1619,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3917.77,1315.17,0.0 +gfx950,256,8192,4096,8192,54,0,120.7533,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4552.72,972.57,0.0 +gfx950,256,8192,4096,14336,54,0,189.8393,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5067.83,817.48,0.0 +gfx950,256,8192,4608,7168,10,0,227.7773,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2375.86,532.86,0.0 +gfx950,256,8192,4608,16384,44,0,262.9408,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4704.29,685.92,0.0 +gfx950,256,8192,5120,1280,45,0,49.1678,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2183.83,1879.4,0.0 +gfx950,256,8192,5120,5120,52,0,116.327,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,3692.15,1014.08,0.0 +gfx950,256,8192,5120,6400,54,0,132.8247,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4041.95,952.27,0.0 +gfx950,256,8192,5120,25600,52,0,429.3911,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,5001.23,592.19,0.0 +gfx950,256,8192,6144,3072,54,0,87.9918,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3514.39,1394.26,0.0 +gfx950,256,8192,6144,4096,45,0,106.5776,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3868.7,1219.99,0.0 +gfx950,256,8192,6144,12288,54,0,240.4063,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5145.25,785.1,0.0 +gfx950,256,8192,6144,16384,54,0,308.4804,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5346.43,707.03,0.0 +gfx950,256,8192,6400,5120,54,0,148.146,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3623.93,959.95,0.0 +gfx950,256,8192,7168,2048,10,0,135.959,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1769.05,979.48,0.0 +gfx950,256,8192,7168,2304,10,0,145.1712,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1863.89,930.87,0.0 +gfx950,256,8192,7168,8192,54,0,210.4687,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4571.1,856.92,0.0 +gfx950,256,8192,7168,16384,54,0,376.4758,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5110.94,646.18,0.0 +gfx950,256,8192,7168,18432,54,0,419.0895,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5165.16,618.0,0.0 +gfx950,256,8192,8192,1024,54,0,55.9419,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2456.82,2549.19,0.0 +gfx950,256,8192,8192,2048,54,0,84.2984,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3260.77,1791.2,0.0 +gfx950,256,8192,8192,3584,54,0,119.4879,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4025.82,1368.99,0.0 +gfx950,256,8192,8192,4096,54,0,129.4913,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4245.5,1295.62,0.0 +gfx950,256,8192,8192,7168,54,0,197.8445,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4862.77,975.2,0.0 +gfx950,256,8192,8192,8192,54,0,216.0155,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5089.97,932.0,0.0 +gfx950,256,8192,8192,28672,54,0,668.518,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5756.45,552.11,0.0 +gfx950,256,8192,9216,16384,54,0,475.0672,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5207.48,618.02,0.0 +gfx950,256,8192,10240,8192,45,0,277.173,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4958.6,877.68,0.0 +gfx950,256,8192,12288,512,45,0,66.5444,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1549.03,3104.24,0.0 +gfx950,256,8192,12288,1536,45,0,107.0387,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2889.03,2027.82,0.0 +gfx950,256,8192,12288,4096,45,0,190.4456,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4330.02,1277.37,0.0 +gfx950,256,8192,12288,6144,45,0,258.7637,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4780.23,1021.17,0.0 +gfx950,256,8192,12800,5120,54,0,257.7219,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4166.28,1022.24,0.0 +gfx950,256,8192,13312,16384,54,0,671.4716,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5321.76,587.16,0.0 +gfx950,256,8192,14336,8192,54,0,387.4984,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4965.56,844.28,0.0 +gfx950,256,8192,16384,512,54,0,86.6263,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1586.57,3171.4,0.0 +gfx950,256,8192,16384,2048,45,0,155.4501,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3536.54,1888.72,0.0 +gfx950,256,8192,16384,4096,54,0,242.0538,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4542.43,1316.93,0.0 +gfx950,256,8192,16384,6656,54,0,366.2828,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4877.94,956.16,0.0 +gfx950,256,8192,16384,8192,54,0,425.1217,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5172.69,868.22,0.0 +gfx950,256,8192,16384,13312,54,0,657.0434,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5438.63,657.51,0.0 +gfx950,256,8192,16384,16384,54,0,786.0032,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5595.46,597.66,0.0 +gfx950,256,8192,16384,26624,54,0,1246.495,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5733.54,477.81,0.0 +gfx950,256,8192,16384,53248,54,0,2463.2448,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5802.77,374.61,0.0 +gfx950,256,8192,18432,7168,54,0,435.3143,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4972.65,912.93,0.0 +gfx950,256,8192,20480,16384,54,0,982.1553,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5597.44,580.79,0.0 +gfx950,256,8192,24576,1536,45,0,208.6912,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2963.59,2050.01,0.0 +gfx950,256,8192,26624,16384,54,0,1399.3116,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5107.39,515.55,0.0 +gfx950,256,8192,28672,4096,54,0,433.7855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4435.71,1256.98,0.0 +gfx950,256,8192,32768,512,36,0,172.7725,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1590.98,3168.08,0.0 +gfx950,256,8192,51200,5120,54,0,926.9073,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4633.65,1069.04,0.0 +gfx950,256,8192,53248,16384,54,0,2561.6588,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5579.84,537.05,0.0 +gfx950,256,8192,57344,8192,54,0,1433.2252,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5370.11,842.83,0.0 +gfx950,256,8192,59136,8192,54,0,1525.5281,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5202.85,815.89,0.0 +gfx950,256,8192,106496,16384,54,0,5127.9293,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5574.82,523.48,0.0 +gfx950,256,9984,15360,13824,54,0,814.223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5207.34,591.84,0.0 +gfx950,256,12416,8960,15360,54,0,651.6207,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5244.64,593.39,0.0 +gfx950,256,12416,11136,12544,54,0,686.4753,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5053.03,618.01,0.0115 +gfx950,256,12800,9344,12800,54,0,611.4549,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5007.47,622.99,0.0137 +gfx950,256,13056,43392,1792,54,0,607.1164,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3344.38,1949.59,0.0029 +gfx950,256,13568,13312,10240,54,0,733.9988,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5039.57,679.65,0.0 +gfx950,256,14720,13568,8704,54,0,727.3392,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4780.08,718.44,0.0 +gfx950,256,16384,512,4096,43,0,24.3647,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256E,2820.45,2108.8,0.0 +gfx950,256,16384,512,7168,9,0,60.4461,a4w4_blockscale_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1989.53,1279.36,0.0 +gfx950,256,16384,800,5120,21,0,48.5545,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2764.27,1445.91,0.0 +gfx950,256,16384,1024,3072,45,0,33.1376,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3110.64,1819.48,0.0 +gfx950,256,16384,1280,8192,50,0,98.537,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3486.99,1159.92,0.0 +gfx950,256,16384,1536,7168,44,0,94.2122,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,3829.41,1215.95,0.0 +gfx950,256,16384,2048,6144,45,0,102.2609,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4032.01,1209.96,0.0 +gfx950,256,16384,2304,16384,44,0,264.3801,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384E,4678.68,864.62,0.0 +gfx950,256,16384,2560,8192,45,0,164.9847,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4165.2,978.76,0.0 +gfx950,256,16384,3072,1536,10,0,97.6538,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1583.34,1183.83,0.0 +gfx950,256,16384,3072,6144,54,0,141.9524,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4356.92,1130.18,0.0 +gfx950,256,16384,4096,4096,54,0,132.0097,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4164.51,1334.45,0.0 +gfx950,256,16384,4096,8192,54,0,226.9404,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4844.94,961.06,0.0 +gfx950,256,16384,4096,14336,54,0,363.6314,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5291.47,772.81,0.0 +gfx950,256,16384,4608,7168,10,0,454.2569,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2382.64,498.02,0.0 +gfx950,256,16384,4608,16384,54,0,486.8838,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5081.09,663.32,0.0 +gfx950,256,16384,5120,1280,45,0,84.5625,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2539.52,2146.75,0.0 +gfx950,256,16384,5120,5120,45,0,200.0472,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4293.95,1113.85,0.0 +gfx950,256,16384,5120,6400,54,0,234.6066,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4576.78,1008.43,0.0 +gfx950,256,16384,5120,25600,54,0,815.9065,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5264.04,542.98,0.0 +gfx950,256,16384,6144,3072,45,0,161.8127,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3822.17,1458.04,0.0 +gfx950,256,16384,6144,4096,54,0,193.6462,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4258.46,1277.92,0.0 +gfx950,256,16384,6144,12288,54,0,467.7627,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5288.8,726.31,0.0 +gfx950,256,16384,6144,16384,54,0,600.1017,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5496.63,643.02,0.0 +gfx950,256,16384,6400,5120,45,0,268.7424,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3995.43,997.39,0.012499047 +gfx950,256,16384,7168,2048,10,0,269.441,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1785.31,961.24,0.0 +gfx950,256,16384,7168,2304,10,0,288.9433,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1872.91,906.8,0.0 +gfx950,256,16384,7168,8192,54,0,383.7231,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5014.41,863.51,0.0 +gfx950,256,16384,7168,16384,54,0,703.6728,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5468.86,607.98,0.0 +gfx950,256,16384,7168,18432,54,0,791.4978,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5469.79,570.99,0.0 +gfx950,256,16384,8192,1024,54,0,107.806,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2549.75,2606.7,0.0 +gfx950,256,16384,8192,2048,54,0,159.5154,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3446.41,1840.58,0.0 +gfx950,256,16384,8192,3584,54,0,233.3336,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4123.16,1339.18,0.0 +gfx950,256,16384,8192,4096,54,0,254.4673,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4320.84,1252.68,0.0 +gfx950,256,16384,8192,7168,54,0,398.3153,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4830.71,895.06,0.0 +gfx950,256,16384,8192,8192,54,0,429.289,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5122.48,859.79,0.0 +gfx950,256,16384,8192,28672,54,0,1325.5902,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5806.15,468.29,0.0 +gfx950,256,16384,9216,16384,54,0,901.7683,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5486.78,567.45,0.0 +gfx950,256,16384,10240,8192,54,0,544.3326,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5049.82,816.77,0.0 +gfx950,256,16384,12288,512,45,0,125.1737,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1646.98,3275.39,0.0 +gfx950,256,16384,12288,1536,45,0,206.8407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2990.1,2053.14,0.0 +gfx950,256,16384,12288,4096,45,0,370.0402,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4457.0,1246.82,0.0 +gfx950,256,16384,12288,6144,54,0,509.7273,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4853.38,962.74,0.0 +gfx950,256,16384,12800,5120,54,0,486.651,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4412.78,1015.39,0.0 +gfx950,256,16384,13312,16384,54,0,1289.7301,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5541.33,526.84,0.0 +gfx950,256,16384,14336,8192,54,0,757.6926,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5078.96,786.06,0.0 +gfx950,256,16384,16384,512,54,0,156.2988,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1758.67,3488.57,0.0 +gfx950,256,16384,16384,2048,54,0,303.603,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3621.54,1878.85,0.0 +gfx950,256,16384,16384,4096,54,0,478.5568,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4595.11,1262.09,0.0 +gfx950,256,16384,16384,6656,54,0,721.8789,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4950.16,894.78,0.0 +gfx950,256,16384,16384,8192,54,0,832.934,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5280.19,805.69,0.0 +gfx950,256,16384,16384,13312,54,0,1296.2628,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5513.41,582.42,0.0 +gfx950,256,16384,16384,16384,54,0,1563.1208,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5627.26,515.19,0.0 +gfx950,256,16384,16384,26624,54,0,2485.1392,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5751.65,391.56,0.0 +gfx950,256,16384,16384,53248,54,0,5103.8959,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5601.07,276.12,0.0 +gfx950,256,16384,18432,7168,54,0,862.0836,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5021.93,845.35,0.0 +gfx950,256,16384,18432,16384,10,0,4031.1796,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2454.77,220.58,0.0 +gfx950,256,16384,20480,16384,45,0,1960.3005,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5608.89,496.39,0.0 +gfx950,256,16384,24576,1536,45,0,399.9546,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3092.73,2092.15,0.0 +gfx950,256,16384,26624,16384,54,0,2587.5139,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5524.09,473.33,0.0 +gfx950,256,16384,28672,4096,54,0,861.417,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4467.4,1197.79,0.0 +gfx950,256,16384,32768,512,36,0,345.2436,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1592.37,3146.55,0.0 +gfx950,256,16384,51200,5120,54,0,1983.796,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4330.05,932.93,0.0 +gfx950,256,16384,53248,16384,54,0,5172.8223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5526.44,447.58,0.0 +gfx950,256,16384,57344,8192,54,0,2953.8423,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5211.23,738.37,0.0 +gfx950,256,16384,59136,8192,54,0,3103.1713,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5115.48,724.13,0.0 +gfx950,256,16384,106496,16384,54,0,10335.4736,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5531.88,435.04,0.0 +gfx950,256,16896,31104,7168,54,0,1565.4111,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4812.82,781.33,0.0041 +gfx950,256,20480,512,7168,7,0,72.5979,a4w4_blockscale_256x96x512x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2070.64,1325.2,0.0 +gfx950,256,20480,1536,7168,10,0,199.649,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2258.82,710.35,0.0 +gfx950,256,20480,3072,1536,10,0,118.3924,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1632.48,1215.59,0.0 +gfx950,256,20480,4608,7168,10,0,572.5215,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,2363.08,486.72,0.0 +gfx950,256,20480,7168,2048,10,0,333.465,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1803.17,965.36,0.0 +gfx950,256,20480,7168,2304,10,0,355.0349,a4w4_blockscale_256x128x384x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3,1905.33,916.68,0.0 +gfx950,256,22784,1664,61696,52,0,1024.4489,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,4566.46,810.19,0.0226 +gfx950,256,24192,32384,2304,45,0,1001.4336,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3604.89,1629.71,0.0043 +gfx950,256,32768,1024,3072,45,0,61.3382,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3361.01,1940.28,0.0 +gfx950,256,32768,1280,8192,54,0,167.5074,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4102.47,1333.35,0.0 +gfx950,256,32768,1536,7168,45,0,162.1303,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4450.46,1379.19,0.0 +gfx950,256,32768,2048,6144,45,0,190.7134,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4323.94,1264.58,0.0 +gfx950,256,32768,2112,7168,48,0,293.0219,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3385.88,898.98,0.0 +gfx950,256,32768,2304,16384,48,0,505.9259,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,4889.85,866.34,0.0 +gfx950,256,32768,2560,8192,54,0,293.3453,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4685.23,1065.21,0.0 +gfx950,256,32768,3072,6144,45,0,272.8936,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4532.72,1141.2,0.0 +gfx950,256,32768,4096,4096,54,0,255.6975,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4300.05,1345.08,0.0 +gfx950,256,32768,4096,8192,54,0,444.8527,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4943.26,942.85,0.0 +gfx950,256,32768,4096,14336,54,0,721.3297,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5335.0,738.46,0.0 +gfx950,256,32768,4608,16384,54,0,906.1081,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5460.5,671.19,0.0 +gfx950,256,32768,5120,1280,45,0,163.3916,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,2628.63,2202.03,0.0 +gfx950,256,32768,5120,5120,45,0,392.6814,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4375.01,1101.5,0.0 +gfx950,256,32768,5120,6400,54,0,457.7773,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4691.11,997.83,0.0 +gfx950,256,32768,5120,25600,54,0,1545.9348,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5556.47,530.75,0.0 +gfx950,256,32768,6144,3072,54,0,315.1874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3924.49,1467.13,0.0 +gfx950,256,32768,6144,4096,54,0,377.8643,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4364.71,1276.5,0.0 +gfx950,256,32768,6144,12288,54,0,933.4489,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5300.56,687.48,0.0 +gfx950,256,32768,6144,16384,54,0,1198.9958,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5502.16,601.69,0.0 +gfx950,256,32768,6400,5120,45,0,518.2698,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4143.56,1002.76,0.016083188 +gfx950,256,32768,7168,8192,54,0,763.9579,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5037.31,829.02,0.0 +gfx950,256,32768,7168,16384,54,0,1414.8817,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5439.73,563.24,0.0 +gfx950,256,32768,7168,18432,54,0,1586.5235,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5457.63,528.08,0.0 +gfx950,256,32768,8192,1024,54,0,211.6275,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2597.75,2635.96,0.0 +gfx950,256,32768,8192,2048,45,0,315.6601,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3483.21,1833.66,0.0 +gfx950,256,32768,8192,3584,54,0,464.7093,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4140.54,1313.23,0.0 +gfx950,256,32768,8192,4096,54,0,483.3855,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4549.21,1284.19,0.0 +gfx950,256,32768,8192,7168,54,0,782.3144,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4919.11,873.91,0.0 +gfx950,256,32768,8192,8192,54,0,843.7771,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5212.33,835.11,0.0 +gfx950,256,32768,8192,28672,54,0,2654.9427,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5797.93,423.39,0.0 +gfx950,256,32768,9216,16384,54,0,1780.1369,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5558.9,532.49,0.0 +gfx950,256,32768,10240,8192,45,0,1088.4791,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5050.68,778.38,0.0 +gfx950,256,32768,12288,512,45,0,245.9473,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,1676.44,3321.2,0.0 +gfx950,256,32768,12288,1536,45,0,400.2607,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3090.36,2098.41,0.0 +gfx950,256,32768,12288,4096,45,0,732.0118,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4506.12,1226.18,0.0 +gfx950,256,32768,12288,6144,45,0,1020.7607,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4847.17,924.52,0.0 +gfx950,256,32768,12800,5120,54,0,997.4188,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4306.08,957.99,0.0 +gfx950,256,32768,13312,16384,54,0,2590.6993,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5517.29,482.46,0.0 +gfx950,256,32768,14336,8192,54,0,1537.4579,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5006.04,736.58,0.0 +gfx950,256,32768,16384,512,54,0,319.1351,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1722.64,3403.97,0.0 +gfx950,256,32768,16384,2048,45,0,593.0636,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3707.9,1895.37,0.0 +gfx950,256,32768,16384,4096,54,0,958.7045,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4587.49,1224.99,0.0 +gfx950,256,32768,16384,6656,54,0,1434.6488,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4981.59,862.45,0.0 +gfx950,256,32768,16384,8192,54,0,1680.4113,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5234.49,758.78,0.0 +gfx950,256,32768,16384,13312,54,0,2618.0472,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5459.66,535.09,0.0 +gfx950,256,32768,16384,16384,45,0,3077.3314,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5716.7,479.76,0.0 +gfx950,256,32768,16384,26624,54,0,4978.7035,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5741.92,347.09,0.0 +gfx950,256,32768,16384,53248,54,0,9949.223,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5746.64,239.45,0.0 +gfx950,256,32768,18432,7168,45,0,1748.9945,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4950.65,795.58,0.0 +gfx950,256,32768,20480,16384,45,0,3960.5471,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,5552.32,449.03,0.0 +gfx950,256,32768,24576,1536,45,0,800.9843,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3088.58,2065.77,0.0 +gfx950,256,32768,26624,16384,54,0,5494.2956,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5203.09,406.12,0.0 +gfx950,256,32768,28672,4096,45,0,1707.3148,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4508.0,1174.29,0.0 +gfx950,256,32768,32768,512,36,0,666.2533,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024E,1650.29,3248.41,0.0 +gfx950,256,32768,51200,5120,54,0,4066.2238,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4225.02,878.06,0.0 +gfx950,256,32768,53248,16384,54,0,10524.9535,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5432.29,398.51,0.0 +gfx950,256,32768,57344,8192,54,0,6141.7014,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5012.67,672.0,0.0 +gfx950,256,32768,59136,8192,54,0,6136.6529,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5173.57,692.88,0.0 +gfx950,256,32768,106496,16384,54,0,20606.9162,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5549.07,394.05,0.0 +gfx950,256,33280,7168,10752,54,0,999.5708,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5132.0,694.85,0.0 +gfx950,256,35200,256,19968,50,0,106.6743,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256E,3373.55,3487.39,0.0 +gfx950,256,48256,61056,5376,45,0,7047.1599,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4495.26,877.87,0.0052 +gfx950,256,51712,14976,7680,54,0,2503.0596,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4752.34,721.1,0.0085 +gfx950,256,51968,61696,4608,54,0,6682.8336,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4421.55,998.73,0.0 +gfx950,256,56832,44416,1280,52,0,2488.2075,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256E,2597.08,2055.01,0.0029 +gfx950,256,60000,4096,512,54,0,165.0566,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,1524.68,3077.3,0.0 +gfx950,256,64896,1280,60672,54,0,2002.6549,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5033.15,1085.38,0.0 +gfx950,256,65536,2112,7168,48,0,575.6528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3447.0,902.06,0.0 +gfx950,256,65536,16384,8192,54,0,3353.2726,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,5246.27,740.48,0.0 +gfx950,256,168448,3200,6400,54,0,1573.7469,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4384.21,1034.05,0.0189 +gfx950,256,540544,7552,1024,54,0,3458.7874,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2417.13,2441.61,0.0082 +gfx950,256,721536,6016,1024,54,0,3558.6675,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,2498.09,2544.22,0.0104 +gfx950,256,838784,5760,3584,45,0,8593.0777,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,4030.16,1300.61,0.031 diff --git a/aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv b/aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv index 3818fa173d..6716ab0267 100644 --- a/aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv +++ b/aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv @@ -1,59 +1,59 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,16,512,7168,asm,6,7,7.9691,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,14.74,476.98,0.0029 -256,32,512,7168,asm,22,7,9.1625,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,25.64,429.16,0.0033 -256,64,512,7168,asm,15,8,8.5433,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,54.99,490.95,0.0078 -256,128,512,7168,asm,23,8,9.7489,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,96.37,484.01,0.0139 -256,256,512,7168,asm,6,7,9.7151,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,193.42,593.63,0.0036 -256,512,512,7168,asm,23,8,11.511,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,326.48,683.2,0.009 -256,1024,512,7168,asm,27,4,15.6429,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,480.49,770.87,0.0012 -256,1536,512,7168,asm,35,4,18.8821,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,597.09,860.76,0.0022 -256,2048,512,7168,ck,17,3,21.5224,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,698.45,950.04,0.0 -256,4096,512,7168,asm,16,1,32.8452,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,915.35,1133.33,0.0 -256,8192,512,7168,asm,40,1,47.2238,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1273.29,1498.8,0.0 -256,16384,512,7168,ck,13,2,81.4869,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1475.81,1692.15,0.0 -256,20480,512,7168,ck,13,2,111.424,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1349.12,1538.65,0.0 -256,16,576,7168,ck,7,0,17.6513,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.49,241.45,0.0 -256,32,576,7168,ck,7,3,17.088,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,15.46,257.2,0.0 -256,64,576,7168,ck,7,2,17.5138,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,30.18,266.15,0.0 -256,128,576,7168,ck,7,3,16.4658,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,64.19,315.43,0.0 -256,256,576,7168,ck,7,2,16.9154,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,124.97,370.0,0.0 -256,512,576,7168,ck,12,3,18.0012,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.87,466.0,0.0 -256,1024,576,7168,ck,17,3,20.7538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,407.43,609.45,0.0 -256,1536,576,7168,ck,17,2,21.2038,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,598.17,797.42,0.0 -256,2048,576,7168,ck,12,3,31.2412,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,541.32,677.57,0.0 -256,4096,576,7168,ck,15,0,43.7737,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,772.68,872.84,0.004 -256,8192,576,7168,ck,15,1,75.2384,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,899.09,960.76,0.0074 -256,16384,576,7168,ck,15,0,135.1229,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1001.25,1039.38,0.0088 -256,20480,576,7168,ck,15,2,164.23,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1029.74,1062.67,0.0089 -256,16,1536,7168,asm,15,8,8.9852,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,39.21,1243.59,0.0081 -256,32,1536,7168,asm,15,8,8.8914,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,79.25,1275.13,0.0089 -256,64,1536,7168,asm,15,8,9.5956,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,146.87,1215.7,0.0072 -256,128,1536,7168,asm,23,8,10.3811,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,271.51,1186.85,0.0094 -256,256,1536,7168,asm,20,5,13.4798,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,418.19,1011.25,0.0018 -256,512,1536,7168,asm,26,3,18.7023,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,602.83,869.03,0.0004 -256,1024,1536,7168,asm,16,1,30.4337,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,740.91,706.32,0.0 -256,1536,1536,7168,asm,24,1,33.7784,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,1001.32,791.59,0.0 -256,2048,1536,7168,ck,14,0,45.9207,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,982.07,696.45,0.0001 -256,4096,1536,7168,ck,14,3,69.9692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1289.06,756.81,0.0001 -256,8192,1536,7168,ck,13,1,115.7144,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1558.91,820.09,0.0 -256,16384,1536,7168,ck,13,3,210.0224,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1717.8,851.25,0.0 -256,20480,1536,7168,ck,13,1,262.5612,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1717.59,840.66,0.0 -256,20480,3072,1536,ck,13,2,135.0665,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1430.95,1199.45,0.0 -256,128,4096,1280,ck,6,3,6.4459,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,208.22,1001.46,0.0 -256,20480,4096,512,ck,16,0,91.452,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,939.28,1972.13,0.0 -256,20480,4608,7168,ck,13,3,706.1114,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1916.01,521.98,0.0 -256,16,7168,256,ck,5,3,2.9065,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,20.2,711.67,0.0 -256,32,7168,256,ck,10,3,3.1777,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,36.96,724.41,0.0 -256,64,7168,256,ck,12,3,3.5803,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,65.6,773.37,0.0 -256,128,7168,256,ck,7,3,4.1898,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,112.12,883.76,0.0 -256,256,7168,256,ck,12,1,5.127,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,183.25,1086.51,0.0 -256,512,7168,256,ck,12,1,6.7969,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,276.46,1369.17,0.0 -256,1024,7168,256,ck,9,3,9.8023,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,383.39,1711.56,0.0 -256,1536,7168,256,ck,9,3,12.8461,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,438.82,1887.6,0.0 -256,2048,7168,256,ck,8,2,15.6196,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,481.2,2030.74,0.0 -256,4096,7168,256,ck,9,1,26.2565,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,572.52,2346.23,0.0 -256,8192,7168,256,ck,16,2,47.1578,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,637.54,2573.76,0.0 -256,16384,7168,256,ck,16,1,85.2357,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,705.45,2826.4,0.0 -256,20480,7168,256,ck,16,0,104.9379,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,716.25,2865.31,0.0 -256,20480,7168,2048,ck,13,0,362.869,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1657.06,965.15,0.0 -256,20480,7168,2304,ck,13,2,395.8895,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1708.7,902.53,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,16,512,7168,asm,6,7,7.9691,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,14.74,476.98,0.0029 +gfx950,256,32,512,7168,asm,22,7,9.1625,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,25.64,429.16,0.0033 +gfx950,256,64,512,7168,asm,15,8,8.5433,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,54.99,490.95,0.0078 +gfx950,256,128,512,7168,asm,23,8,9.7489,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,96.37,484.01,0.0139 +gfx950,256,256,512,7168,asm,6,7,9.7151,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,193.42,593.63,0.0036 +gfx950,256,512,512,7168,asm,23,8,11.511,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,326.48,683.2,0.009 +gfx950,256,1024,512,7168,asm,27,4,15.6429,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,480.49,770.87,0.0012 +gfx950,256,1536,512,7168,asm,35,4,18.8821,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,597.09,860.76,0.0022 +gfx950,256,2048,512,7168,ck,17,3,21.5224,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,698.45,950.04,0.0 +gfx950,256,4096,512,7168,asm,16,1,32.8452,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,915.35,1133.33,0.0 +gfx950,256,8192,512,7168,asm,40,1,47.2238,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1273.29,1498.8,0.0 +gfx950,256,16384,512,7168,ck,13,2,81.4869,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1475.81,1692.15,0.0 +gfx950,256,20480,512,7168,ck,13,2,111.424,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1349.12,1538.65,0.0 +gfx950,256,16,576,7168,ck,7,0,17.6513,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.49,241.45,0.0 +gfx950,256,32,576,7168,ck,7,3,17.088,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,15.46,257.2,0.0 +gfx950,256,64,576,7168,ck,7,2,17.5138,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,30.18,266.15,0.0 +gfx950,256,128,576,7168,ck,7,3,16.4658,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,64.19,315.43,0.0 +gfx950,256,256,576,7168,ck,7,2,16.9154,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,124.97,370.0,0.0 +gfx950,256,512,576,7168,ck,12,3,18.0012,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.87,466.0,0.0 +gfx950,256,1024,576,7168,ck,17,3,20.7538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,407.43,609.45,0.0 +gfx950,256,1536,576,7168,ck,17,2,21.2038,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,598.17,797.42,0.0 +gfx950,256,2048,576,7168,ck,12,3,31.2412,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,541.32,677.57,0.0 +gfx950,256,4096,576,7168,ck,15,0,43.7737,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,772.68,872.84,0.004 +gfx950,256,8192,576,7168,ck,15,1,75.2384,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,899.09,960.76,0.0074 +gfx950,256,16384,576,7168,ck,15,0,135.1229,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1001.25,1039.38,0.0088 +gfx950,256,20480,576,7168,ck,15,2,164.23,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1029.74,1062.67,0.0089 +gfx950,256,16,1536,7168,asm,15,8,8.9852,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,39.21,1243.59,0.0081 +gfx950,256,32,1536,7168,asm,15,8,8.8914,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,79.25,1275.13,0.0089 +gfx950,256,64,1536,7168,asm,15,8,9.5956,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,146.87,1215.7,0.0072 +gfx950,256,128,1536,7168,asm,23,8,10.3811,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,271.51,1186.85,0.0094 +gfx950,256,256,1536,7168,asm,20,5,13.4798,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,418.19,1011.25,0.0018 +gfx950,256,512,1536,7168,asm,26,3,18.7023,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,602.83,869.03,0.0004 +gfx950,256,1024,1536,7168,asm,16,1,30.4337,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,740.91,706.32,0.0 +gfx950,256,1536,1536,7168,asm,24,1,33.7784,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,1001.32,791.59,0.0 +gfx950,256,2048,1536,7168,ck,14,0,45.9207,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,982.07,696.45,0.0001 +gfx950,256,4096,1536,7168,ck,14,3,69.9692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1289.06,756.81,0.0001 +gfx950,256,8192,1536,7168,ck,13,1,115.7144,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1558.91,820.09,0.0 +gfx950,256,16384,1536,7168,ck,13,3,210.0224,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1717.8,851.25,0.0 +gfx950,256,20480,1536,7168,ck,13,1,262.5612,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1717.59,840.66,0.0 +gfx950,256,20480,3072,1536,ck,13,2,135.0665,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1430.95,1199.45,0.0 +gfx950,256,128,4096,1280,ck,6,3,6.4459,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,208.22,1001.46,0.0 +gfx950,256,20480,4096,512,ck,16,0,91.452,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,939.28,1972.13,0.0 +gfx950,256,20480,4608,7168,ck,13,3,706.1114,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1916.01,521.98,0.0 +gfx950,256,16,7168,256,ck,5,3,2.9065,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,20.2,711.67,0.0 +gfx950,256,32,7168,256,ck,10,3,3.1777,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,36.96,724.41,0.0 +gfx950,256,64,7168,256,ck,12,3,3.5803,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,65.6,773.37,0.0 +gfx950,256,128,7168,256,ck,7,3,4.1898,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,112.12,883.76,0.0 +gfx950,256,256,7168,256,ck,12,1,5.127,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,183.25,1086.51,0.0 +gfx950,256,512,7168,256,ck,12,1,6.7969,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,276.46,1369.17,0.0 +gfx950,256,1024,7168,256,ck,9,3,9.8023,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,383.39,1711.56,0.0 +gfx950,256,1536,7168,256,ck,9,3,12.8461,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,438.82,1887.6,0.0 +gfx950,256,2048,7168,256,ck,8,2,15.6196,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,481.2,2030.74,0.0 +gfx950,256,4096,7168,256,ck,9,1,26.2565,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,572.52,2346.23,0.0 +gfx950,256,8192,7168,256,ck,16,2,47.1578,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,637.54,2573.76,0.0 +gfx950,256,16384,7168,256,ck,16,1,85.2357,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,705.45,2826.4,0.0 +gfx950,256,20480,7168,256,ck,16,0,104.9379,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,716.25,2865.31,0.0 +gfx950,256,20480,7168,2048,ck,13,0,362.869,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1657.06,965.15,0.0 +gfx950,256,20480,7168,2304,ck,13,2,395.8895,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1708.7,902.53,0.0 diff --git a/aiter/configs/a8w8_blockscale_tuned_gemm.csv b/aiter/configs/a8w8_blockscale_tuned_gemm.csv index 09aa60e37e..d3146fc1df 100644 --- a/aiter/configs/a8w8_blockscale_tuned_gemm.csv +++ b/aiter/configs/a8w8_blockscale_tuned_gemm.csv @@ -1,6 +1,6 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,8192,512,7168,ck,0,0,64.1614,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,937.16,1103.14,0.0 -256,16384,512,7168,cktile,11,0,98.713,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1218.27,1396.85,0.0 -256,20480,512,7168,cktile,27,0,95.1492,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1579.88,1801.82,0.0 -256,128,1024,4096,ck,8,0,13.7599,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.03,361.97,0.0 -256,128,4096,1280,ck,7,0,7.4194,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,180.9,870.06,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,8192,512,7168,ck,0,0,64.1614,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,937.16,1103.14,0.0 +gfx950,256,16384,512,7168,cktile,11,0,98.713,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1218.27,1396.85,0.0 +gfx950,256,20480,512,7168,cktile,27,0,95.1492,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1579.88,1801.82,0.0 +gfx950,256,128,1024,4096,ck,8,0,13.7599,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.03,361.97,0.0 +gfx950,256,128,4096,1280,ck,7,0,7.4194,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,180.9,870.06,0.0 diff --git a/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv b/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv index e5d9f3e300..ff6f724a96 100644 --- a/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv +++ b/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv @@ -1,551 +1,551 @@ -cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -80,128,128,5120,torch.int8,asm,0,8,7.3189,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,22.92,183.56,0.182 -80,192,128,5120,torch.int8,asm,1,8,7.5835,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,33.18,222.53,0.1825 -80,128,1280,8192,torch.int8,asm,3,4,14.7532,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,181.95,804.03,0.1279 -80,192,1280,1024,torch.int8,asm,1,1,6.7934,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,74.09,294.23,0.0 -80,192,1280,5120,torch.int8,asm,5,4,14.7099,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,171.08,545.77,0.1266 -80,192,1280,8192,torch.int8,asm,5,4,19.1633,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,210.12,654.91,0.1288 -80,256,1280,8192,torch.int8,asm,3,2,22.3885,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,239.8,591.3,0.0662 -80,320,1280,8192,torch.int8,asm,4,2,25.8821,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,259.29,538.07,0.0666 -80,512,1280,8192,torch.int8,asm,3,1,34.2419,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,313.58,466.99,0.0 -80,1024,1280,8192,torch.int8,asm,7,1,60.6271,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,354.21,354.56,0.0 -80,2048,1280,8192,torch.int8,asm,7,1,118.4251,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,362.67,274.48,0.0 -80,4096,1280,8192,torch.int8,asm,7,1,233.2788,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,368.23,233.74,0.0 -80,8192,1280,8192,torch.int8,asm,6,1,453.0835,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,379.18,217.55,0.0 -80,16384,1280,8192,torch.int8,asm,7,1,926.2119,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,370.97,201.52,0.0 -80,64,1536,5120,torch.int8,asm,3,6,10.6691,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,94.35,786.25,0.1567 -80,80,1536,5120,torch.int8,asm,4,6,11.6667,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,107.85,730.26,0.1588 -80,128,1536,5120,torch.int8,asm,3,3,14.8921,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,135.19,598.5,0.1017 -80,150,1536,5120,torch.int8,asm,4,3,15.3652,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,153.55,591.8,0.1017 -80,192,1536,1024,torch.int8,asm,1,1,6.9538,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,86.86,339.28,0.0 -80,192,1536,5120,torch.int8,asm,3,2,17.2532,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,175.03,546.98,0.0645 -80,220,1536,5120,torch.int8,asm,4,2,18.9054,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,183.03,511.31,0.0653 -80,256,1536,5120,torch.int8,asm,5,2,20.7976,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,193.61,478.97,0.0645 -80,384,1536,5120,torch.int8,asm,3,1,24.6804,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,244.72,446.1,0.0 -80,448,1536,5120,torch.int8,asm,4,1,27.7779,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,253.67,415.23,0.0 -80,512,1536,5120,torch.int8,asm,5,1,32.3204,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,249.16,373.1,0.0 -80,128,8192,1024,torch.int8,asm,7,1,15.3831,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,139.6,690.16,0.0 -80,192,8192,1024,torch.int8,asm,6,1,18.2548,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,176.46,642.62,0.0 -80,192,8192,5120,torch.int8,asm,6,1,60.0549,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,268.19,767.16,0.0 -80,256,8192,1024,torch.int8,asm,7,1,28.1363,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,152.65,456.53,0.0 -80,320,8192,1024,torch.int8,asm,3,1,32.0764,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,167.37,435.18,0.0 -80,512,8192,1024,torch.int8,asm,6,1,33.8085,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,254.08,511.75,0.0 -80,1024,8192,1024,torch.int8,asm,6,1,64.2316,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,267.47,408.12,0.0 -80,2048,8192,1024,torch.int8,asm,6,1,123.9609,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,277.18,355.27,0.0 -80,4096,8192,1024,torch.int8,asm,6,1,232.1688,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,295.99,343.25,0.0 -80,8192,8192,1024,torch.int8,asm,6,1,459.697,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,298.98,328.47,0.0 -80,16384,8192,1024,torch.int8,asm,6,1,900.9508,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,305.1,325.88,0.0 -80,192,1536,5120,torch.float8_e4m3fnuz,cktile,9,0,18.9229,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,159.59,498.72,0.0 -256,64,192,1024,torch.float8_e4m3fn,flydsl,989,0,3.199,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x3_default,7.87,89.63,0.0 -256,32,384,7168,torch.float8_e4m3fn,ck,10,0,10.6799,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,16.49,281.51,0.0 -256,64,384,7168,torch.float8_e4m3fn,ck,10,0,10.0171,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,35.17,325.49,0.0 -256,96,384,7168,torch.float8_e4m3fn,ck,8,0,10.3046,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,51.29,341.05,0.0 -256,256,384,7168,torch.float8_e4m3fn,ck,10,0,9.4593,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,148.98,505.76,0.0 -256,512,384,7168,torch.float8_e4m3fn,flydsl,767,0,11.6866,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x1x2_default,241.18,583.21,0.0 -256,1024,384,7168,torch.float8_e4m3fn,flydsl,776,0,13.3134,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x1x2_default,423.42,817.14,0.0 -256,2048,384,7168,torch.float8_e4m3fn,cktile,152,0,18.6948,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,603.07,1016.62,0.0 -256,4096,384,7168,torch.float8_e4m3fn,flydsl,458,0,25.4051,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x0x1x1_default,887.56,1387.85,0.0 -256,8192,384,7168,torch.float8_e4m3fn,flydsl,792,0,34.7994,flydsl_bpreshuflle_64x192x256_F8_F8_B16_2x0x1x2_default,1295.92,1947.28,0.0 -256,16384,384,7168,torch.float8_e4m3fn,ck,149,0,54.0722,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1668.03,2455.53,0.0 -256,1,800,5120,torch.float8_e4m3fn,ck,10,0,8.6917,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.94,472.03,0.0 -256,16,800,5120,torch.float8_e4m3fn,ck,10,0,8.2944,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,15.8,506.79,0.0 -256,32,800,5120,torch.float8_e4m3fn,ck,8,0,8.5999,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,30.48,501.29,0.0 -256,64,800,5120,torch.float8_e4m3fn,ck,10,0,8.3239,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,62.99,543.75,0.0 -256,128,800,5120,torch.float8_e4m3fn,ck,24,0,8.2591,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,126.96,600.08,0.0 -256,256,800,5120,torch.float8_e4m3fn,ck,10,0,9.7206,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,215.74,598.35,0.0 -256,512,800,5120,torch.float8_e4m3fn,ck,10,0,14.8326,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,282.78,508.11,0.0 -256,1024,800,5120,torch.float8_e4m3fn,ck,10,0,22.0756,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,379.99,497.26,0.0 -256,2048,800,5120,torch.float8_e4m3fn,ck,10,0,39.1074,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,429.0,456.65,0.0 -256,4096,800,5120,torch.float8_e4m3fn,ck,69,0,42.7898,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,784.17,738.99,0.0 -256,8192,800,5120,torch.float8_e4m3fn,ck,69,0,55.2653,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1214.3,1070.22,0.0 -256,16384,800,5120,torch.float8_e4m3fn,ck,69,0,96.4525,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1391.54,1183.97,0.0 -256,32768,800,5120,torch.float8_e4m3fn,ck,57,0,170.0227,a8w8_bpreshuffle_256x160x160x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_4x4x1_1x1_intrawave_v3,1578.82,1319.22,0.0 -256,96,1024,7168,torch.float8_e4m3fn,ck,10,0,10.9919,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,128.21,748.26,0.0 -256,128,1024,7168,torch.float8_e4m3fn,ck,8,0,11.0339,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,170.3,772.14,0.0 -256,256,1024,7168,torch.float8_e4m3fn,ck,11,0,11.9595,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,314.24,811.01,0.0 -256,512,1024,7168,torch.float8_e4m3fn,cktile,9,0,13.7695,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,545.86,875.75,0.0 -256,1024,1024,7168,torch.float8_e4m3fn,ck,114,0,19.6079,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,766.65,855.64,0.0 -256,2048,1024,7168,torch.float8_e4m3fn,flydsl,626,0,27.5555,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x1x1x1_default,1091.06,951.33,0.0 -256,4096,1024,7168,torch.float8_e4m3fn,flydsl,1280,0,40.8095,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x3_default,1473.42,1104.86,0.0 -256,8192,1024,7168,torch.float8_e4m3fn,ck,154,0,62.6397,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1919.85,1322.44,0.0 -256,16384,1024,7168,torch.float8_e4m3fn,ck,33,0,110.6249,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2174.18,1431.28,0.0 -256,1,1280,8192,torch.float8_e4m3fn,ck,10,0,12.7699,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.64,821.97,0.0 -256,16,1280,8192,torch.float8_e4m3fn,ck,10,0,12.5805,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,26.67,847.17,0.0 -256,32,1280,8192,torch.float8_e4m3fn,ck,10,0,11.5596,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,58.05,936.87,0.0 -256,64,1280,8192,torch.float8_e4m3fn,ck,8,0,11.6259,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,115.45,961.12,0.0 -256,128,1280,8192,torch.float8_e4m3fn,ck,11,0,12.3468,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,217.41,960.74,0.0 -256,256,1280,8192,torch.float8_e4m3fn,cktile,9,0,14.9227,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,359.77,887.12,0.0 -256,512,1280,8192,torch.float8_e4m3fn,ck,114,0,20.8376,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,515.29,767.4,0.0 -256,1024,1280,8192,torch.float8_e4m3fn,cktile,216,0,27.0331,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_48x64x256_1x4x1_16x16x128_default,794.39,795.17,0.0 -256,2048,1280,8192,torch.float8_e4m3fn,cktile,99,0,36.1565,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x128x256_1x4x1_16x16x128_default,1187.88,899.03,0.0 -256,4096,1280,8192,torch.float8_e4m3fn,ck,138,0,57.904,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1483.48,941.66,0.0 -256,8192,1280,8192,torch.float8_e4m3fn,ck,51,0,94.2147,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1823.48,1046.19,0.0 -256,16384,1280,8192,torch.float8_e4m3fn,ck,154,0,170.1443,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2019.45,1096.99,0.0 -256,32768,1280,8192,torch.float8_e4m3fn,ck,154,0,293.2178,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2343.63,1237.33,0.0 -256,1,2304,16384,torch.float8_e4m3fn,cktile,2,0,23.5282,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,3.21,1605.3,0.0 -256,16,2304,16384,torch.float8_e4m3fn,ck,10,0,21.4115,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,56.42,1778.7,0.0 -256,32,2304,16384,torch.float8_e4m3fn,ck,10,0,21.3018,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,113.41,1803.63,0.0 -256,64,2304,16384,torch.float8_e4m3fn,ck,5,0,23.2067,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,208.21,1684.52,0.0 -256,128,2304,16384,torch.float8_e4m3fn,cktile,152,0,27.897,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,346.41,1449.46,0.0 -256,256,2304,16384,torch.float8_e4m3fn,ck,114,0,39.2871,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,491.95,1097.63,0.0 -256,512,2304,16384,torch.float8_e4m3fn,cktile,117,0,48.8917,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_80x64x256_1x4x1_16x16x128_default,790.62,991.92,0.0 -256,1024,2304,16384,torch.float8_e4m3fn,cktile,92,0,64.6028,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_80x128x256_1x4x1_16x16x128_default,1196.69,917.06,0.0 -256,2048,2304,16384,torch.float8_e4m3fn,cktile,132,0,93.6614,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x128x256_1x4x1_16x16x128_default,1650.83,862.05,0.0 -256,4096,2304,16384,torch.float8_e4m3fn,cktile,89,0,140.171,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_224x192x128_1x4x1_16x16x128_default,2206.15,882.72,0.0 -256,8192,2304,16384,torch.float8_e4m3fn,cktile,55,0,261.4037,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x192x128_1x4x1_16x16x128_default,2365.98,802.27,0.0 -256,16384,2304,16384,torch.float8_e4m3fn,cktile,234,0,474.9339,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x192x128_1x4x1_16x16x128_default,2604.47,803.65,0.0 -256,32768,2304,16384,torch.float8_e4m3fn,cktile,234,0,941.4041,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x192x128_1x4x1_16x16x128_default,2627.88,770.78,0.0 -256,1,2560,8192,torch.float8_e4m3fn,ck,10,0,12.9563,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,3.24,1619.66,0.0 -256,16,2560,8192,torch.float8_e4m3fn,ck,10,0,11.7435,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,57.15,1803.94,0.0 -256,32,2560,8192,torch.float8_e4m3fn,ck,10,0,11.955,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,112.27,1789.84,0.0 -256,64,2560,8192,torch.float8_e4m3fn,ck,11,0,12.9931,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,206.6,1679.62,0.0 -256,128,2560,8192,torch.float8_e4m3fn,cktile,152,0,15.9038,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,337.57,1425.79,0.0 -256,256,2560,8192,torch.float8_e4m3fn,ck,114,0,21.9372,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,489.46,1111.33,0.0 -256,512,2560,8192,torch.float8_e4m3fn,cktile,217,0,27.7332,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_48x64x256_1x4x1_16x16x128_default,774.34,1001.95,0.0 -256,1024,2560,8192,torch.float8_e4m3fn,cktile,253,0,36.6417,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_96x128x256_1x4x1_16x16x128_default,1172.15,944.36,0.0 -256,2048,2560,8192,torch.float8_e4m3fn,cktile,119,0,57.5848,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_112x128x256_1x4x1_16x16x128_default,1491.7,837.63,0.0 -256,4096,2560,8192,torch.float8_e4m3fn,ck,51,0,93.1071,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1845.17,810.87,0.0 -256,8192,2560,8192,torch.float8_e4m3fn,flydsl,976,0,163.6521,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x2_default,2099.56,794.51,0.0 -256,16384,2560,8192,torch.float8_e4m3fn,flydsl,979,0,284.3899,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2416.38,840.66,0.0 -256,32768,2560,8192,torch.float8_e4m3fn,ck,33,0,548.7549,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2504.56,833.12,0.0 -256,1,4608,16384,torch.float8_e4m3fn,flydsl,767,0,24.8086,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x1x2_default,6.09,3044.23,0.0 -256,16,4608,16384,torch.float8_e4m3fn,ck,10,0,22.6718,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,106.56,3348.08,0.0 -256,32,4608,16384,torch.float8_e4m3fn,ck,11,0,25.5291,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,189.27,2989.4,0.0 -256,64,4608,16384,torch.float8_e4m3fn,cktile,37,0,30.5476,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,316.35,2525.1,0.0 -256,128,4608,16384,torch.float8_e4m3fn,ck,114,0,41.2697,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,468.32,1908.77,0.0 -256,256,4608,16384,torch.float8_e4m3fn,cktile,226,0,50.4918,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_96x64x256_1x4x1_16x16x128_default,765.56,1625.04,0.0 -256,512,4608,16384,torch.float8_e4m3fn,cktile,92,0,64.2676,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_80x128x256_1x4x1_16x16x128_default,1202.93,1378.68,0.0 -256,1024,4608,16384,torch.float8_e4m3fn,ck,149,0,93.1125,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1660.56,1092.35,0.0 -256,2048,4608,16384,torch.float8_e4m3fn,cktile,232,0,142.0215,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_224x192x128_1x4x1_16x16x128_default,2177.4,900.75,0.0 -256,4096,4608,16384,torch.float8_e4m3fn,flydsl,815,0,259.8623,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x0x1x2_default,2380.01,694.04,0.0 -256,8192,4608,16384,torch.float8_e4m3fn,cktile,235,0,472.9169,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x192x128_1x4x1_16x16x128_default,2615.58,603.09,0.0 -256,16384,4608,16384,torch.float8_e4m3fn,cktile,121,0,915.2083,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2703.1,540.78,0.0 -256,32768,4608,16384,torch.float8_e4m3fn,cktile,10,0,1778.3005,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2782.32,514.18,0.0 -256,1,5120,640,torch.float8_e4m3fn,ck,9,0,5.2374,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.25,627.73,0.0 -256,1,5120,1280,torch.float8_e4m3fn,flydsl,415,0,5.2116,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x1_default,2.52,1259.71,0.0 -256,1,5120,3200,torch.float8_e4m3fn,ck,9,0,13.8435,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.37,1184.49,0.0 -256,1,5120,5120,torch.float8_e4m3fn,ck,10,0,8.9938,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.83,2916.43,0.0 -256,1,5120,6400,torch.float8_e4m3fn,flydsl,1213,0,15.5391,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x1x1x3_default,4.22,2109.82,0.0 -256,1,5120,25600,torch.float8_e4m3fn,flydsl,1159,0,36.6584,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,7.15,3576.47,0.0 -256,16,5120,640,torch.float8_e4m3fn,ck,9,0,5.3278,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.68,647.71,0.0 -256,16,5120,1280,torch.float8_e4m3fn,flydsl,83,0,5.3531,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x0_default,39.18,1258.69,0.0 -256,16,5120,3200,torch.float8_e4m3fn,ck,9,0,14.0198,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,37.4,1183.97,0.0 -256,16,5120,5120,torch.float8_e4m3fn,ck,10,0,9.0428,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,92.77,2926.1,0.0 -256,16,5120,6400,torch.float8_e4m3fn,flydsl,415,0,15.5054,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x1_default,67.63,2130.5,0.0 -256,16,5120,25600,torch.float8_e4m3fn,ck,10,0,36.6833,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,114.34,3588.7,0.0 -256,32,5120,640,torch.float8_e4m3fn,flydsl,1316,0,4.9324,flydsl_bpreshuflle_32x64x128_F8_F8_B16_2x0x0x4_default,42.52,734.93,0.0 -256,32,5120,1280,torch.float8_e4m3fn,flydsl,664,0,5.4128,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x0x2_default,77.49,1278.86,0.0 -256,32,5120,3200,torch.float8_e4m3fn,ck,9,0,14.3633,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,73.0,1170.63,0.0 -256,32,5120,5120,torch.float8_e4m3fn,ck,5,0,10.6609,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,157.37,2505.03,0.0 -256,32,5120,6400,torch.float8_e4m3fn,flydsl,415,0,15.8728,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x1_default,132.12,2097.96,0.0 -256,32,5120,25600,torch.float8_e4m3fn,ck,11,0,39.0845,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,214.63,3382.9,0.0 -256,64,5120,640,torch.float8_e4m3fn,flydsl,422,0,5.0513,flydsl_bpreshuflle_32x64x128_F8_F8_B16_1x0x1x1_default,83.03,786.55,0.0 -256,64,5120,1280,torch.float8_e4m3fn,flydsl,91,0,5.5934,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x0_default,149.97,1303.48,0.0 -256,64,5120,3200,torch.float8_e4m3fn,ck,9,0,14.7686,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,142.0,1167.62,0.0 -256,64,5120,5120,torch.float8_e4m3fn,flydsl,529,0,12.134,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x1_default,276.53,2241.42,0.0 -256,64,5120,6400,torch.float8_e4m3fn,flydsl,506,0,17.9065,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x1x0x1_default,234.23,1889.42,0.0 -256,64,5120,25600,torch.float8_e4m3fn,cktile,37,0,46.4046,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,361.54,2873.98,0.0 -256,128,5120,640,torch.float8_e4m3fn,flydsl,1365,0,5.5954,flydsl_bpreshuflle_32x128x128_F8_F8_B16_1x0x1x4_default,149.92,834.51,0.0 -256,128,5120,1280,torch.float8_e4m3fn,flydsl,541,0,6.5611,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x1_default,255.71,1223.6,0.0 -256,128,5120,3200,torch.float8_e4m3fn,ck,9,0,14.9077,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,281.35,1214.43,0.0 -256,128,5120,5120,torch.float8_e4m3fn,ck,114,0,15.9196,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,421.55,1770.18,0.0 -256,128,5120,6400,torch.float8_e4m3fn,ck,114,0,18.6977,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,448.64,1866.43,0.0 -256,128,5120,25600,torch.float8_e4m3fn,ck,114,0,63.2614,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,530.41,2144.43,0.0 -256,256,5120,640,torch.float8_e4m3fn,flydsl,1002,0,6.5154,flydsl_bpreshuflle_64x128x128_F8_F8_B16_1x0x0x3_default,257.5,930.42,0.0 -256,256,5120,1280,torch.float8_e4m3fn,ck,113,0,8.4096,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,399.0,1129.98,0.0 -256,256,5120,3200,torch.float8_e4m3fn,ck,76,0,16.7712,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,500.18,1182.06,0.0 -256,256,5120,5120,torch.float8_e4m3fn,ck,120,0,19.4182,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,691.2,1552.49,0.0 -256,256,5120,6400,torch.float8_e4m3fn,ck,120,0,23.2983,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,720.1,1589.29,0.0 -256,256,5120,25600,torch.float8_e4m3fn,cktile,227,0,77.1726,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_96x64x256_1x4x1_16x16x128_default,869.59,1817.32,0.0 -256,512,5120,640,torch.float8_e4m3fn,flydsl,1179,0,7.8677,flydsl_bpreshuflle_64x64x128_F8_F8_B16_2x1x0x3_default,426.48,1124.52,0.0 -256,512,5120,1280,torch.float8_e4m3fn,flydsl,814,0,10.5382,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x2_default,636.82,1181.59,0.0 -256,512,5120,3200,torch.float8_e4m3fn,ck,86,0,21.8373,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,768.28,1065.39,0.0 -256,512,5120,5120,torch.float8_e4m3fn,ck,123,0,26.3712,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1017.91,1292.27,0.0 -256,512,5120,6400,torch.float8_e4m3fn,ck,123,0,30.9752,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1083.27,1332.93,0.0 -256,512,5120,25600,torch.float8_e4m3fn,cktile,99,0,96.9889,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x128x256_1x4x1_16x16x128_default,1383.85,1540.61,0.0 -256,1024,5120,640,torch.float8_e4m3fn,flydsl,376,0,10.9979,flydsl_bpreshuflle_64x128x128_F8_F8_B16_2x0x0x1_default,610.2,1310.97,0.0 -256,1024,5120,1280,torch.float8_e4m3fn,ck,123,0,15.4162,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,870.63,1190.31,0.0 -256,1024,5120,3200,torch.float8_e4m3fn,ck,86,0,27.9273,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1201.49,1079.47,0.0 -256,1024,5120,5120,torch.float8_e4m3fn,ck,51,0,40.9005,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1312.63,1025.49,0.0 -256,1024,5120,6400,torch.float8_e4m3fn,ck,51,0,47.925,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1400.29,1039.28,0.0 -256,1024,5120,25600,torch.float8_e4m3fn,cktile,106,0,150.9142,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x256x256_1x4x1_16x16x128_default,1778.73,1111.71,0.0 -256,2048,5120,640,torch.float8_e4m3fn,flydsl,151,0,15.6989,flydsl_bpreshuflle_128x128x128_F8_F8_B16_2x0x1x0_default,854.95,1628.08,0.0 -256,2048,5120,1280,torch.float8_e4m3fn,flydsl,814,0,22.7992,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x2_default,1177.39,1322.26,0.0 -256,2048,5120,3200,torch.float8_e4m3fn,ck,51,0,43.7434,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1534.15,1003.79,0.0 -256,2048,5120,5120,torch.float8_e4m3fn,ck,143,0,61.2716,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1752.43,941.25,0.0 -256,2048,5120,6400,torch.float8_e4m3fn,ck,143,0,74.2562,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1807.5,900.22,0.0 -256,2048,5120,25600,torch.float8_e4m3fn,cktile,10,0,244.7836,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2193.25,835.32,0.0 -256,4096,5120,640,torch.float8_e4m3fn,flydsl,267,0,24.8734,flydsl_bpreshuflle_128x128x128_F8_F8_B16_1x1x1x0_default,1079.21,1923.39,0.0 -256,4096,5120,1280,torch.float8_e4m3fn,flydsl,1204,0,37.3618,flydsl_bpreshuflle_128x128x128_F8_F8_B16_2x1x0x3_default,1436.95,1438.35,0.0 -256,4096,5120,3200,torch.float8_e4m3fn,flydsl,1279,0,75.9837,flydsl_bpreshuflle_128x128x128_F8_F8_B16_2x1x1x3_default,1766.4,940.13,0.0 -256,4096,5120,5120,torch.float8_e4m3fn,flydsl,814,0,111.8359,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x2_default,1920.21,796.96,0.0 -256,4096,5120,6400,torch.float8_e4m3fn,flydsl,152,0,133.987,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x0_default,2003.44,753.25,0.0 -256,4096,5120,25600,torch.float8_e4m3fn,cktile,130,0,453.567,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_224x128x128_1x4x1_16x16x128_default,2367.33,612.64,0.0 -256,8192,5120,640,torch.float8_e4m3fn,flydsl,898,0,45.6191,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x0x2_default,1176.86,2025.59,0.0 -256,8192,5120,1280,torch.float8_e4m3fn,flydsl,825,0,71.4842,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,1502.07,1411.86,0.0 -256,8192,5120,3200,torch.float8_e4m3fn,flydsl,321,0,128.779,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x0_default,2084.47,982.18,0.0 -256,8192,5120,5120,torch.float8_e4m3fn,flydsl,825,0,192.9599,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2225.83,787.95,0.0 -256,8192,5120,6400,torch.float8_e4m3fn,flydsl,825,0,229.0136,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2344.28,738.31,0.0 -256,8192,5120,25600,torch.float8_e4m3fn,cktile,10,0,825.5439,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2601.3,514.42,0.0 -256,16384,5120,640,torch.float8_e4m3fn,flydsl,898,0,83.2532,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x0x2_default,1289.73,2180.51,0.0 -256,16384,5120,1280,torch.float8_e4m3fn,flydsl,435,0,130.1539,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x1x1_default,1649.96,1500.51,0.0 -256,16384,5120,3200,torch.float8_e4m3fn,flydsl,321,0,238.792,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x0_default,2248.28,990.76,0.0 -256,16384,5120,5120,torch.float8_e4m3fn,flydsl,825,0,371.1527,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2314.39,748.67,0.0 -256,16384,5120,6400,torch.float8_e4m3fn,flydsl,825,0,439.5686,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2442.72,694.77,0.0 -256,16384,5120,25600,torch.float8_e4m3fn,cktile,10,0,1510.0434,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2844.27,475.66,0.0 -256,32768,5120,640,torch.float8_e4m3fn,flydsl,736,0,161.3587,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x0x2_default,1330.88,2229.77,0.0 -256,32768,5120,1280,torch.float8_e4m3fn,flydsl,103,0,262.1888,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x1x0_default,1638.12,1464.75,0.0 -256,32768,5120,3200,torch.float8_e4m3fn,flydsl,653,0,489.6873,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x1_default,2192.71,932.81,0.0 -256,32768,5120,5120,torch.float8_e4m3fn,ck,33,0,741.3936,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2317.24,714.24,0.0 -256,32768,5120,6400,torch.float8_e4m3fn,ck,33,0,875.2866,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2453.46,660.39,0.0 -256,32768,5120,25600,torch.float8_e4m3fn,cktile,10,0,2964.8617,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2897.25,440.32,0.0 -256,1,6400,5120,torch.float8_e4m3fn,flydsl,1443,0,10.0031,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x4_default,6.55,3277.58,0.0 -256,16,6400,5120,torch.float8_e4m3fn,ck,10,0,9.0378,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,116.02,3657.39,0.0 -256,32,6400,5120,torch.float8_e4m3fn,cktile,2,0,11.1359,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,188.32,2994.05,0.0 -256,64,6400,5120,torch.float8_e4m3fn,cktile,152,0,12.8013,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,327.65,2649.33,0.0 -256,128,6400,5120,torch.float8_e4m3fn,ck,114,0,16.3798,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,512.13,2140.55,0.0 -256,256,6400,5120,torch.float8_e4m3fn,ck,65,0,21.7673,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,770.75,1716.13,0.0 -256,512,6400,5120,torch.float8_e4m3fn,ck,139,0,29.5786,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1134.42,1418.02,0.0 -256,1024,6400,5120,torch.float8_e4m3fn,ck,138,0,45.3094,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1481.12,1128.2,0.0 -256,2048,6400,5120,torch.float8_e4m3fn,ck,138,0,81.0869,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1655.23,856.71,0.0 -256,4096,6400,5120,torch.float8_e4m3fn,ck,143,0,142.3021,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1886.38,746.08,0.0 -256,8192,6400,5120,torch.float8_e4m3fn,flydsl,825,0,248.0387,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2164.46,723.95,0.0 -256,16384,6400,5120,torch.float8_e4m3fn,flydsl,825,0,466.6318,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2301.05,699.41,0.0 -256,32768,6400,5120,torch.float8_e4m3fn,flydsl,825,0,932.2305,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2303.6,665.04,0.0 -256,8,6656,16384,torch.float8_e4m3fn,ck,10,0,26.6859,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,65.38,4095.4,0.0 -256,1,7168,8192,torch.float8_e4m3fn,ck,10,0,14.4987,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.1,4051.59,0.0 -256,16,7168,8192,torch.float8_e4m3fn,ck,24,0,14.8554,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,126.49,3977.05,0.0 -256,32,7168,256,torch.float8_e4m3fn,flydsl,1071,0,2.8866,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x3_default,40.68,797.46,0.0 -256,32,7168,8192,torch.float8_e4m3fn,ck,5,0,15.5595,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,241.53,3820.25,0.0 -256,64,7168,256,torch.float8_e4m3fn,flydsl,1071,0,3.0972,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x3_default,75.84,894.0,0.0 -256,64,7168,8192,torch.float8_e4m3fn,cktile,152,0,18.9201,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,397.26,3179.8,0.0 -256,96,7168,256,torch.float8_e4m3fn,flydsl,753,0,3.597,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x2_default,97.95,899.59,0.0 -256,96,7168,512,torch.float8_e4m3fn,flydsl,1299,0,4.5415,flydsl_bpreshuflle_32x128x256_F8_F8_B16_1x0x0x4_default,155.16,1121.97,0.0 -256,128,7168,256,torch.float8_e4m3fn,flydsl,428,0,3.563,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x1_default,131.84,1039.23,0.0 -256,128,7168,512,torch.float8_e4m3fn,flydsl,758,0,4.5422,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x2_default,206.84,1226.4,0.0 -256,128,7168,8192,torch.float8_e4m3fn,ck,114,0,24.0436,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,625.21,2562.17,0.0 -256,256,7168,256,torch.float8_e4m3fn,flydsl,1368,0,4.5468,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x4_default,206.63,1225.16,0.0 -256,256,7168,512,torch.float8_e4m3fn,flydsl,1078,0,5.8237,flydsl_bpreshuflle_64x128x256_F8_F8_B16_1x0x1x3_default,322.66,1282.88,0.0 -256,256,7168,8192,torch.float8_e4m3fn,ck,144,0,30.8344,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,975.04,2091.41,0.0 -256,512,7168,256,torch.float8_e4m3fn,flydsl,1006,0,6.0654,flydsl_bpreshuflle_128x128x128_F8_F8_B16_1x0x0x3_default,309.8,1534.29,0.0 -256,512,7168,8192,torch.float8_e4m3fn,ck,139,0,43.6511,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1377.5,1609.46,0.0 -256,1024,7168,256,torch.float8_e4m3fn,flydsl,1182,0,8.8106,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x1x0x3_default,426.54,1904.21,0.0 -256,1024,7168,512,torch.float8_e4m3fn,flydsl,16,0,11.3748,flydsl_bpreshuflle_64x256x128_F8_F8_B16_1x0x0x0_default,660.78,1659.31,0.0 -256,1024,7168,8192,torch.float8_e4m3fn,ck,154,0,65.1203,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1846.72,1255.97,0.0 -256,2048,7168,256,torch.float8_e4m3fn,flydsl,927,0,13.0037,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,578.0,2439.26,0.0 -256,2048,7168,8192,torch.float8_e4m3fn,cktile,155,0,113.909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2111.49,920.54,0.0 -256,4096,7168,256,torch.float8_e4m3fn,flydsl,976,0,22.1568,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x2_default,678.45,2780.36,0.0 -256,4096,7168,512,torch.float8_e4m3fn,flydsl,186,0,31.9569,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,940.79,2017.95,0.0 -256,4096,7168,8192,torch.float8_e4m3fn,flydsl,979,0,212.6801,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2261.78,709.96,0.0 -256,8192,7168,256,torch.float8_e4m3fn,flydsl,650,0,40.7412,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x1_default,737.95,2979.11,0.0 -256,8192,7168,8192,torch.float8_e4m3fn,flydsl,825,0,384.7901,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2500.25,632.21,0.0 -256,16384,7168,256,torch.float8_e4m3fn,flydsl,1205,0,75.8308,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x0x3_default,792.94,3176.95,0.0 -256,16384,7168,512,torch.float8_e4m3fn,flydsl,1533,0,107.5937,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x1x1x4_default,1117.71,2295.11,0.0 -256,16384,7168,8192,torch.float8_e4m3fn,flydsl,979,0,746.7473,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2576.7,572.91,0.0 -256,32768,7168,8192,torch.float8_e4m3fn,ck,33,0,1495.0753,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2573.98,533.03,0.0 -256,1,8192,1024,torch.float8_e4m3fn,flydsl,1,0,4.4273,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x0_default,3.79,1898.68,0.0 -256,1,8192,2048,torch.float8_e4m3fn,flydsl,827,0,5.6823,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x1x0x2_default,5.91,2955.78,0.0 -256,1,8192,3584,torch.float8_e4m3fn,ck,10,0,7.9608,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,7.38,3690.6,0.0 -256,1,8192,7168,torch.float8_e4m3fn,flydsl,848,0,13.2795,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x2_default,8.84,4423.65,0.0 -256,1,8192,8192,torch.float8_e4m3fn,ck,24,0,15.2454,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,8.8,4403.52,0.0 -256,1,8192,28672,torch.float8_e4m3fn,flydsl,1159,0,45.1638,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,10.4,5201.65,0.0 -256,16,8192,1024,torch.float8_e4m3fn,flydsl,1289,0,4.409,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x4_default,60.88,1965.78,0.0 -256,16,8192,2048,torch.float8_e4m3fn,flydsl,1,0,5.5976,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x0_default,95.91,3049.9,0.0 -256,16,8192,3584,torch.float8_e4m3fn,ck,10,0,7.841,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,119.82,3785.18,0.0 -256,16,8192,7168,torch.float8_e4m3fn,ck,10,0,13.2832,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,141.46,4449.01,0.0 -256,16,8192,8192,torch.float8_e4m3fn,ck,24,0,15.4927,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,138.61,4357.02,0.0 -256,16,8192,28672,torch.float8_e4m3fn,ck,10,0,43.6421,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,172.22,5398.5,0.0 -256,32,8192,1024,torch.float8_e4m3fn,flydsl,673,0,4.8647,flydsl_bpreshuflle_32x64x512_F8_F8_B16_1x0x0x2_default,110.36,1838.89,0.0 -256,32,8192,2048,torch.float8_e4m3fn,ck,5,0,6.3939,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,167.93,2716.19,0.0 -256,32,8192,3584,torch.float8_e4m3fn,ck,5,0,8.8319,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,212.76,3396.68,0.0 -256,32,8192,7168,torch.float8_e4m3fn,ck,11,0,14.6614,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,256.33,4056.5,0.0 -256,32,8192,8192,torch.float8_e4m3fn,ck,5,0,16.4179,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,261.6,4135.44,0.0 -256,32,8192,28672,torch.float8_e4m3fn,ck,5,0,48.1626,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,312.12,4906.77,0.0 -256,64,8192,2048,torch.float8_e4m3fn,cktile,37,0,7.4625,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,287.77,2406.28,0.0 -256,64,8192,3584,torch.float8_e4m3fn,cktile,37,0,10.8507,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,346.35,2823.6,0.0 -256,64,8192,7168,torch.float8_e4m3fn,cktile,37,0,17.4069,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,431.79,3459.98,0.0 -256,64,8192,8192,torch.float8_e4m3fn,cktile,37,0,19.3604,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,443.69,3547.54,0.0 -256,64,8192,28672,torch.float8_e4m3fn,cktile,151,0,56.6791,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_32x64x512_1x4x1_16x16x128_default,530.44,4194.93,0.0 -256,128,8192,1024,torch.float8_e4m3fn,flydsl,677,0,6.3283,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x0x2_default,339.35,1677.68,0.0 -256,128,8192,2048,torch.float8_e4m3fn,flydsl,869,0,9.304,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x2_default,461.63,2056.8,0.0 -256,128,8192,3584,torch.float8_e4m3fn,ck,114,0,13.2791,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,566.02,2403.48,0.0 -256,128,8192,7168,torch.float8_e4m3fn,ck,114,0,22.4532,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,669.5,2749.49,0.0 -256,128,8192,8192,torch.float8_e4m3fn,ck,114,0,25.4862,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,674.09,2756.57,0.0 -256,128,8192,28672,torch.float8_e4m3fn,ck,114,0,75.9032,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,792.19,3170.46,0.0 -256,256,8192,1024,torch.float8_e4m3fn,flydsl,1128,0,8.3043,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x0x1x3_default,517.2,1546.8,0.0 -256,256,8192,2048,torch.float8_e4m3fn,flydsl,1203,0,11.5596,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x1x0x3_default,743.1,1859.56,0.0 -256,256,8192,3584,torch.float8_e4m3fn,ck,65,0,17.0541,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,881.45,2021.33,0.0 -256,256,8192,7168,torch.float8_e4m3fn,ck,65,0,28.8311,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1042.79,2245.82,0.0 -256,256,8192,8192,torch.float8_e4m3fn,ck,144,0,31.9715,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1074.7,2295.8,0.0 -256,256,8192,28672,torch.float8_e4m3fn,ck,144,0,97.4398,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1234.19,2528.9,0.0 -256,512,8192,1024,torch.float8_e4m3fn,flydsl,1418,0,11.2514,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x4_default,763.45,1537.72,0.0 -256,512,8192,2048,torch.float8_e4m3fn,ck,139,0,15.9434,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1077.55,1644.22,0.0 -256,512,8192,3584,torch.float8_e4m3fn,ck,139,0,23.5971,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1274.09,1677.48,0.0 -256,512,8192,7168,torch.float8_e4m3fn,cktile,135,0,40.5772,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1481.86,1744.3,0.0 -256,512,8192,8192,torch.float8_e4m3fn,cktile,135,0,45.0861,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1524.18,1767.55,0.0 -256,512,8192,28672,torch.float8_e4m3fn,cktile,135,0,129.391,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1858.85,1993.57,0.0 -256,1024,8192,1024,torch.float8_e4m3fn,flydsl,925,0,16.7804,flydsl_bpreshuflle_128x128x128_F8_F8_B16_1x1x1x2_default,1023.81,1562.2,0.0 -256,1024,8192,2048,torch.float8_e4m3fn,ck,154,0,25.4967,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1347.62,1398.28,0.0 -256,1024,8192,3584,torch.float8_e4m3fn,ck,154,0,36.3958,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1652.1,1368.49,0.0 -256,1024,8192,7168,torch.float8_e4m3fn,ck,154,0,61.9703,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1940.59,1336.73,0.0 -256,1024,8192,8192,torch.float8_e4m3fn,ck,154,0,69.8202,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1968.47,1321.6,0.0 -256,1024,8192,28672,torch.float8_e4m3fn,ck,154,0,201.8712,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2382.89,1392.07,0.0 -256,2048,8192,1024,torch.float8_e4m3fn,flydsl,846,0,23.7296,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x2_default,1447.97,1855.92,0.0 -256,2048,8192,2048,torch.float8_e4m3fn,flydsl,269,0,42.4634,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x0_default,1618.32,1284.07,0.0 -256,2048,8192,3584,torch.float8_e4m3fn,flydsl,684,0,66.9636,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x0x2_default,1795.89,1049.15,0.0 -256,2048,8192,7168,torch.float8_e4m3fn,ck,33,0,108.3244,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2220.35,987.36,0.0 -256,2048,8192,8192,torch.float8_e4m3fn,cktile,155,0,122.2729,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2248.07,960.48,0.0 -256,2048,8192,28672,torch.float8_e4m3fn,cktile,154,0,329.0964,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2923.38,994.1,0.0 -256,4096,8192,1024,torch.float8_e4m3fn,flydsl,518,0,47.2123,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x1_default,1455.54,1687.95,0.0 -256,4096,8192,2048,torch.float8_e4m3fn,flydsl,601,0,76.8953,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1787.35,1200.0,0.0 -256,4096,8192,3584,torch.float8_e4m3fn,flydsl,825,0,119.4876,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2012.91,930.21,0.0 -256,4096,8192,7168,torch.float8_e4m3fn,flydsl,825,0,206.4213,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2330.36,751.81,0.0 -256,4096,8192,8192,torch.float8_e4m3fn,flydsl,979,0,228.5549,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2405.36,734.06,0.0 -256,4096,8192,28672,torch.float8_e4m3fn,cktile,10,0,643.0461,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2992.24,652.26,0.0 -256,8192,8192,1024,torch.float8_e4m3fn,flydsl,186,0,95.7087,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,1436.01,1577.65,0.0 -256,8192,8192,2048,torch.float8_e4m3fn,flydsl,927,0,146.7966,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,1872.51,1142.89,0.0 -256,8192,8192,3584,torch.float8_e4m3fn,flydsl,825,0,216.5513,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2221.35,890.96,0.0 -256,8192,8192,7168,torch.float8_e4m3fn,flydsl,825,0,387.3297,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2483.86,649.73,0.0 -256,8192,8192,8192,torch.float8_e4m3fn,flydsl,979,0,435.3376,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2525.65,616.61,0.0 -256,8192,8192,28672,torch.float8_e4m3fn,cktile,10,0,1294.8599,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2971.97,466.44,0.0 -256,16384,8192,1024,torch.float8_e4m3fn,flydsl,927,0,181.2201,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,1516.82,1620.14,0.0 -256,16384,8192,2048,torch.float8_e4m3fn,flydsl,103,0,287.1698,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x1x0_default,1914.39,1110.03,0.0 -256,16384,8192,3584,torch.float8_e4m3fn,flydsl,825,0,427.2274,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2251.9,834.49,0.0 -256,16384,8192,7168,torch.float8_e4m3fn,flydsl,825,0,768.6011,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2503.44,578.45,0.0 -256,16384,8192,8192,torch.float8_e4m3fn,flydsl,979,0,871.056,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2524.55,539.3,0.0 -256,16384,8192,28672,torch.float8_e4m3fn,cktile,154,0,2614.7771,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2943.49,372.15,0.0 -256,32768,8192,1024,torch.float8_e4m3fn,ck,51,0,367.838,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1494.56,1573.56,0.0 -256,32768,8192,2048,torch.float8_e4m3fn,ck,143,0,576.9388,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1905.77,1075.95,0.0 -256,32768,8192,3584,torch.float8_e4m3fn,flydsl,825,0,864.7149,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2225.18,790.63,0.0 -256,32768,8192,7168,torch.float8_e4m3fn,ck,33,0,1526.85,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2520.41,543.91,0.0 -256,32768,8192,8192,torch.float8_e4m3fn,cktile,115,0,1726.1188,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2547.94,505.42,0.0 -256,32768,8192,28672,torch.float8_e4m3fn,cktile,10,0,5232.9311,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2941.59,327.02,0.0 -256,1,9216,16384,torch.float8_e4m3fn,flydsl,1159,0,28.55,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,10.58,5290.01,0.0 -256,16,9216,16384,torch.float8_e4m3fn,ck,5,0,30.546,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,158.18,4961.44,0.0 -256,32,9216,16384,torch.float8_e4m3fn,flydsl,857,0,34.613,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x2_default,279.19,4394.56,0.0 -256,64,9216,16384,torch.float8_e4m3fn,flydsl,624,0,45.6522,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x1x1_default,423.36,3356.32,0.0 -256,128,9216,16384,torch.float8_e4m3fn,cktile,75,0,53.8184,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_48x128x256_1x4x1_16x16x128_default,718.24,2888.44,0.0 -256,256,9216,16384,torch.float8_e4m3fn,cktile,132,0,66.0848,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x128x256_1x4x1_16x16x128_default,1169.85,2419.74,0.0 -256,512,9216,16384,torch.float8_e4m3fn,ck,149,0,92.2759,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1675.61,1829.52,0.0 -256,1024,9216,16384,torch.float8_e4m3fn,cktile,233,0,142.3588,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_224x192x128_1x4x1_16x16x128_default,2172.24,1311.1,0.0 -256,2048,9216,16384,torch.float8_e4m3fn,cktile,55,0,265.8551,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x192x128_1x4x1_16x16x128_default,2326.36,836.16,0.0 -256,4096,9216,16384,torch.float8_e4m3fn,cktile,121,0,464.1345,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2665.07,632.58,0.0 -256,8192,9216,16384,torch.float8_e4m3fn,cktile,235,0,919.8305,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x192x128_1x4x1_16x16x128_default,2689.52,474.23,0.0 -256,16384,9216,16384,torch.float8_e4m3fn,cktile,10,0,1743.5349,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2837.8,413.77,0.0 -256,32768,9216,16384,torch.float8_e4m3fn,cktile,154,0,3492.6909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2833.23,369.87,0.0 -256,1,10240,8192,torch.float8_e4m3fn,flydsl,929,0,17.3869,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x1x2_default,9.65,4826.32,0.0 -256,16,10240,8192,torch.float8_e4m3fn,ck,10,0,18.0502,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,148.72,4672.79,0.0 -256,32,10240,8192,torch.float8_e4m3fn,cktile,37,0,20.9702,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,256.02,4044.0,0.0 -256,64,10240,8192,torch.float8_e4m3fn,ck,114,0,25.7553,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,416.9,3328.29,0.0 -256,128,10240,8192,torch.float8_e4m3fn,ck,120,0,31.1068,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,690.36,2814.69,0.0 -256,256,10240,8192,torch.float8_e4m3fn,cktile,253,0,39.3727,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_96x128x256_1x4x1_16x16x128_default,1090.85,2316.99,0.0 -256,512,10240,8192,torch.float8_e4m3fn,ck,143,0,56.8519,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1510.93,1733.74,0.0 -256,1024,10240,8192,torch.float8_e4m3fn,ck,51,0,95.1718,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1805.14,1189.91,0.0 -256,2048,10240,8192,torch.float8_e4m3fn,flydsl,1280,0,168.0196,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x3_default,2044.98,848.75,0.0 -256,4096,10240,8192,torch.float8_e4m3fn,flydsl,979,0,280.4078,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2450.7,717.98,0.0 -256,8192,10240,8192,torch.float8_e4m3fn,flydsl,979,0,533.7477,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2574.98,597.22,0.0 -256,16384,10240,8192,torch.float8_e4m3fn,flydsl,979,0,1059.2396,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2595.05,522.68,0.0 -256,32768,10240,8192,torch.float8_e4m3fn,ck,154,0,2180.7525,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2520.95,469.29,0.0 -256,1,12800,5120,torch.float8_e4m3fn,flydsl,22,0,12.9565,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x0_default,10.12,5060.53,0.0 -256,16,12800,5120,torch.float8_e4m3fn,flydsl,848,0,13.1569,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x2_default,159.4,5018.47,0.0 -256,32,12800,5120,torch.float8_e4m3fn,flydsl,1018,0,14.4111,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x3_default,291.05,4615.82,0.0 -256,64,12800,5120,torch.float8_e4m3fn,flydsl,541,0,18.6039,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x1_default,450.91,3628.38,0.0 -256,128,12800,5120,torch.float8_e4m3fn,ck,65,0,22.6804,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,739.72,3062.92,0.0 -256,256,12800,5120,torch.float8_e4m3fn,flydsl,152,0,31.0997,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x0_default,1078.93,2360.16,0.0 -256,512,12800,5120,torch.float8_e4m3fn,ck,138,0,46.4049,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1446.16,1751.21,0.0 -256,1024,12800,5120,torch.float8_e4m3fn,cktile,130,0,80.2052,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_224x128x128_1x4x1_16x16x128_default,1673.43,1209.31,0.0 -256,2048,12800,5120,torch.float8_e4m3fn,ck,51,0,146.6395,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1830.58,875.96,0.0 -256,4096,12800,5120,torch.float8_e4m3fn,flydsl,825,0,249.1087,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2155.17,768.2,0.0 -256,8192,12800,5120,torch.float8_e4m3fn,flydsl,825,0,456.4241,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2352.51,694.96,0.0 -256,16384,12800,5120,torch.float8_e4m3fn,flydsl,825,0,882.1332,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2434.42,644.86,0.0 -256,32768,12800,5120,torch.float8_e4m3fn,cktile,115,0,1784.2978,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2407.09,600.89,0.0 -256,1,13312,16384,torch.float8_e4m3fn,flydsl,686,0,39.3036,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x2_default,11.1,5550.3,0.0 -256,16,13312,16384,torch.float8_e4m3fn,ck,25,0,39.8372,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,175.2,5492.15,0.0 -256,32,13312,16384,torch.float8_e4m3fn,flydsl,529,0,40.8319,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x1_default,341.86,5375.21,0.0 -256,64,13312,16384,torch.float8_e4m3fn,ck,114,0,50.5591,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,552.17,4368.28,0.0 -256,128,13312,16384,torch.float8_e4m3fn,ck,65,0,61.0826,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,914.08,3660.76,0.0 -256,256,13312,16384,torch.float8_e4m3fn,cktile,1,0,78.4817,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x256_1x4x1_16x16x128_default,1422.87,2919.33,0.0 -256,512,13312,16384,torch.float8_e4m3fn,ck,154,0,119.7862,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1864.47,2004.6,0.0 -256,1024,13312,16384,torch.float8_e4m3fn,cktile,10,0,188.278,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2372.43,1392.32,0.0 -256,2048,13312,16384,torch.float8_e4m3fn,cktile,155,0,366.501,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2437.52,835.43,0.0 -256,4096,13312,16384,torch.float8_e4m3fn,flydsl,825,0,704.7777,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2535.13,559.42,0.0 -256,8192,13312,16384,torch.float8_e4m3fn,cktile,10,0,1317.8416,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2711.56,432.85,0.0 -256,16384,13312,16384,torch.float8_e4m3fn,cktile,154,0,2517.5291,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2838.83,366.53,0.0 -256,32768,13312,16384,torch.float8_e4m3fn,cktile,10,0,5030.6118,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2841.33,323.5,0.0 -256,1,14336,8192,torch.float8_e4m3fn,ck,10,0,21.8553,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.75,5375.24,0.0 -256,16,14336,8192,torch.float8_e4m3fn,ck,10,0,21.3952,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,175.65,5516.67,0.0 -256,32,14336,8192,torch.float8_e4m3fn,flydsl,31,0,25.1546,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x0_default,298.8,4715.64,0.0 -256,64,14336,8192,torch.float8_e4m3fn,ck,114,0,29.5305,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,509.05,4056.82,0.0 -256,128,14336,8192,torch.float8_e4m3fn,ck,144,0,37.2517,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,807.07,3279.29,0.0 -256,256,14336,8192,torch.float8_e4m3fn,cktile,29,0,46.2123,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x256_1x4x1_16x16x128_default,1301.16,2745.54,0.0 -256,512,14336,8192,torch.float8_e4m3fn,ck,154,0,65.8134,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1827.27,2071.23,0.0 -256,1024,14336,8192,torch.float8_e4m3fn,cktile,154,0,116.9719,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2056.2,1326.72,0.0 -256,2048,14336,8192,torch.float8_e4m3fn,cktile,115,0,218.7174,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2199.35,882.13,0.0 -256,4096,14336,8192,torch.float8_e4m3fn,flydsl,825,0,386.6129,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2488.47,694.33,0.0 -256,8192,14336,8192,torch.float8_e4m3fn,flydsl,979,0,738.2272,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2606.44,568.16,0.0 -256,16384,14336,8192,torch.float8_e4m3fn,flydsl,825,0,1489.9573,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2582.82,484.19,0.0 -256,32768,14336,8192,torch.float8_e4m3fn,cktile,82,0,2971.0677,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x256x128_1x4x1_16x16x128_default,2590.51,446.1,0.0 -256,1,16384,2048,torch.float8_e4m3fn,flydsl,333,0,7.7788,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x1_default,8.63,4318.05,0.0 -256,1,16384,4096,torch.float8_e4m3fn,flydsl,520,0,14.2993,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x1_default,9.39,4695.74,0.0 -256,1,16384,6656,torch.float8_e4m3fn,flydsl,1159,0,19.6315,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,11.11,5556.95,0.0 -256,1,16384,8192,torch.float8_e4m3fn,ck,24,0,25.4479,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,10.55,5275.83,0.0 -256,1,16384,13312,torch.float8_e4m3fn,flydsl,745,0,37.3614,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x2_default,11.68,5838.91,0.0 -256,1,16384,26624,torch.float8_e4m3fn,cktile,144,0,71.1231,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x256_1x4x1_16x16x128_default,12.27,6133.97,0.0 -256,4,16384,6656,torch.float8_e4m3fn,flydsl,1009,0,19.7247,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x3_default,44.23,5536.69,0.0 -256,8,16384,6656,torch.float8_e4m3fn,flydsl,848,0,19.7962,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x2_default,88.14,5524.66,0.0 -256,16,16384,2048,torch.float8_e4m3fn,flydsl,333,0,7.7788,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x1_default,138.03,4385.19,0.0 -256,16,16384,4096,torch.float8_e4m3fn,ck,10,0,14.0016,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,153.37,4835.07,0.0 -256,16,16384,6656,torch.float8_e4m3fn,flydsl,686,0,19.8218,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x2_default,176.05,5533.44,0.0 -256,16,16384,8192,torch.float8_e4m3fn,ck,10,0,24.0843,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,178.33,5600.04,0.0 -256,16,16384,13312,torch.float8_e4m3fn,flydsl,188,0,37.636,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x0_default,185.44,5814.67,0.0 -256,16,16384,26624,torch.float8_e4m3fn,flydsl,354,0,71.2847,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x1_default,195.82,6132.56,0.0 -256,32,16384,2048,torch.float8_e4m3fn,ck,12,0,8.8156,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,243.6,3932.64,0.0 -256,32,16384,4096,torch.float8_e4m3fn,flydsl,1018,0,15.3879,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x3_default,279.11,4437.81,0.0 -256,32,16384,6656,torch.float8_e4m3fn,flydsl,197,0,20.6179,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x0_default,338.51,5350.37,0.0 -256,32,16384,8192,torch.float8_e4m3fn,cktile,9,0,27.6417,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,310.76,4903.04,0.0 -256,32,16384,13312,torch.float8_e4m3fn,flydsl,363,0,38.3032,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x1_default,364.43,5732.64,0.0 -256,32,16384,26624,torch.float8_e4m3fn,flydsl,363,0,71.7202,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x1_default,389.25,6108.57,0.0 -256,64,16384,2048,torch.float8_e4m3fn,flydsl,869,0,10.0391,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x2_default,427.82,3564.33,0.0 -256,64,16384,4096,torch.float8_e4m3fn,ck,114,0,19.0594,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,450.69,3644.82,0.0 -256,64,16384,6656,torch.float8_e4m3fn,ck,114,0,24.5126,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,569.45,4551.74,0.0 -256,64,16384,8192,torch.float8_e4m3fn,ck,119,0,31.4037,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,547.07,4357.42,0.0 -256,64,16384,13312,torch.float8_e4m3fn,ck,114,0,44.9751,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,620.73,4915.01,0.0 -256,64,16384,26624,torch.float8_e4m3fn,ck,114,0,83.2724,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,670.51,5283.97,0.0 -256,128,16384,2048,torch.float8_e4m3fn,flydsl,1483,0,12.3136,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x1x0x4_default,697.6,3086.9,0.0 -256,128,16384,4096,torch.float8_e4m3fn,ck,144,0,23.4825,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,731.6,3058.77,0.0 -256,128,16384,6656,torch.float8_e4m3fn,ck,144,0,31.2393,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,893.66,3652.39,0.0 -256,128,16384,8192,torch.float8_e4m3fn,cktile,24,0,38.2204,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x128_default,898.99,3648.85,0.0 -256,128,16384,13312,torch.float8_e4m3fn,cktile,40,0,55.2899,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x64x256_1x4x1_16x16x128_default,1009.85,4051.41,0.0 -256,128,16384,26624,torch.float8_e4m3fn,cktile,158,0,103.2711,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x64x256_1x4x1_16x16x128_default,1081.32,4297.52,0.0 -256,256,16384,2048,torch.float8_e4m3fn,flydsl,1130,0,16.7273,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x3_default,1027.06,2538.8,0.0 -256,256,16384,4096,torch.float8_e4m3fn,ck,139,0,28.7093,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1196.82,2666.25,0.0 -256,256,16384,6656,torch.float8_e4m3fn,cktile,29,0,41.8226,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x256_1x4x1_16x16x128_default,1335.03,2848.81,0.0 -256,256,16384,8192,torch.float8_e4m3fn,cktile,1,0,48.6422,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x256_1x4x1_16x16x128_default,1412.75,2974.85,0.0 -256,256,16384,13312,torch.float8_e4m3fn,cktile,135,0,71.204,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1568.3,3228.76,0.0 -256,256,16384,26624,torch.float8_e4m3fn,cktile,29,0,131.8695,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x256_1x4x1_16x16x128_default,1693.63,3423.17,0.0 -256,512,16384,2048,torch.float8_e4m3fn,flydsl,495,0,25.8961,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x1_default,1326.83,1984.09,0.0 -256,512,16384,4096,torch.float8_e4m3fn,ck,154,0,43.5445,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1578.14,1974.61,0.0 -256,512,16384,6656,torch.float8_e4m3fn,ck,154,0,60.6062,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1842.54,2132.41,0.0 -256,512,16384,8192,torch.float8_e4m3fn,ck,154,0,69.0175,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1991.36,2248.55,0.0 -256,512,16384,13312,torch.float8_e4m3fn,ck,154,0,106.2418,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2102.17,2274.97,0.0 -256,512,16384,26624,torch.float8_e4m3fn,ck,154,0,195.6027,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2283.59,2385.53,0.0 -256,1024,16384,2048,torch.float8_e4m3fn,flydsl,927,0,47.073,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,1459.85,1470.18,0.0 -256,1024,16384,4096,torch.float8_e4m3fn,ck,107,0,77.2916,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1778.19,1356.65,0.0 -256,1024,16384,6656,torch.float8_e4m3fn,ck,33,0,104.7702,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2131.7,1426.19,0.0 -256,1024,16384,8192,torch.float8_e4m3fn,cktile,154,0,121.4869,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2262.61,1450.04,0.0 -256,1024,16384,13312,torch.float8_e4m3fn,cktile,155,0,176.2123,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2534.88,1505.51,0.0 -256,1024,16384,26624,torch.float8_e4m3fn,cktile,155,0,314.8139,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2837.72,1578.79,0.0 -256,2048,16384,2048,torch.float8_e4m3fn,flydsl,601,0,80.3525,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1710.45,1304.97,0.0 -256,2048,16384,4096,torch.float8_e4m3fn,flydsl,979,0,134.1836,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2048.52,1062.77,0.0 -256,2048,16384,6656,torch.float8_e4m3fn,ck,33,0,194.6083,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2295.26,975.25,0.0 -256,2048,16384,8192,torch.float8_e4m3fn,ck,33,0,230.0845,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2389.36,947.93,0.0 -256,2048,16384,13312,torch.float8_e4m3fn,cktile,155,0,339.3213,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2632.76,920.88,0.0 -256,2048,16384,26624,torch.float8_e4m3fn,cktile,154,0,604.2855,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2956.73,923.14,0.0 -256,4096,16384,2048,torch.float8_e4m3fn,flydsl,601,0,147.6909,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1861.17,1192.77,0.0 -256,4096,16384,4096,torch.float8_e4m3fn,flydsl,979,0,241.7976,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2273.62,902.01,0.0 -256,4096,16384,6656,torch.float8_e4m3fn,flydsl,825,0,365.2692,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2445.74,740.64,0.0 -256,4096,16384,8192,torch.float8_e4m3fn,flydsl,979,0,432.8399,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2540.23,697.69,0.0 -256,4096,16384,13312,torch.float8_e4m3fn,cktile,155,0,665.018,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2686.7,611.78,0.0 -256,4096,16384,26624,torch.float8_e4m3fn,cktile,155,0,1192.4676,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2996.65,569.81,0.0 -256,8192,16384,2048,torch.float8_e4m3fn,flydsl,979,0,292.9922,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1876.35,1087.97,0.0 -256,8192,16384,4096,torch.float8_e4m3fn,flydsl,979,0,467.2444,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2353.18,789.95,0.0 -256,8192,16384,6656,torch.float8_e4m3fn,flydsl,825,0,720.0273,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2481.44,600.0,0.0 -256,8192,16384,8192,torch.float8_e4m3fn,flydsl,825,0,855.8274,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2569.47,548.9,0.0 -256,8192,16384,13312,torch.float8_e4m3fn,cktile,155,0,1317.2113,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2712.86,452.16,0.0 -256,8192,16384,26624,torch.float8_e4m3fn,cktile,10,0,2446.191,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2921.61,377.22,0.0 -256,16384,16384,2048,torch.float8_e4m3fn,flydsl,979,0,573.0956,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1918.55,1053.89,0.0 -256,16384,16384,4096,torch.float8_e4m3fn,flydsl,979,0,945.5081,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2325.76,709.77,0.0 -256,16384,16384,6656,torch.float8_e4m3fn,flydsl,825,0,1434.0728,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2491.79,526.45,0.0 -256,16384,16384,8192,torch.float8_e4m3fn,cktile,115,0,1706.7491,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2576.86,471.84,0.0 -256,16384,16384,13312,torch.float8_e4m3fn,cktile,10,0,2589.7115,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2759.7,375.75,0.0 -256,16384,16384,26624,torch.float8_e4m3fn,cktile,155,0,4846.7561,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2949.12,290.77,0.0 -256,32768,16384,2048,torch.float8_e4m3fn,ck,143,0,1121.2198,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1961.28,1047.44,0.0 -256,32768,16384,4096,torch.float8_e4m3fn,flydsl,979,0,1893.374,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2322.86,673.44,0.0 -256,32768,16384,6656,torch.float8_e4m3fn,cktile,115,0,2831.5792,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2523.97,494.74,0.0 -256,32768,16384,8192,torch.float8_e4m3fn,cktile,115,0,3423.2243,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2569.53,431.29,0.0 -256,32768,16384,13312,torch.float8_e4m3fn,cktile,10,0,5107.7843,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2798.41,338.32,0.0 -256,32768,16384,26624,torch.float8_e4m3fn,cktile,155,0,9941.6576,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2875.51,239.63,0.0 -256,1,26624,16384,torch.float8_e4m3fn,flydsl,1138,0,75.2625,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x1x0x3_default,11.59,5796.74,0.0 -256,16,26624,16384,torch.float8_e4m3fn,flydsl,0,0,75.7716,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x0x0_default,184.22,5771.58,0.0 -256,32,26624,16384,torch.float8_e4m3fn,flydsl,1317,0,77.5854,flydsl_bpreshuflle_32x64x256_F8_F8_B16_2x0x0x4_default,359.83,5651.01,0.0 -256,64,26624,16384,torch.float8_e4m3fn,flydsl,377,0,79.7028,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x0x0x1_default,700.53,5528.84,0.0 -256,128,26624,16384,torch.float8_e4m3fn,flydsl,1418,0,97.1275,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x4_default,1149.72,4582.85,0.0 -256,256,26624,16384,torch.float8_e4m3fn,ck,154,0,131.4447,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1699.1,3454.18,0.0 -256,512,26624,16384,torch.float8_e4m3fn,cktile,155,0,195.1564,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2288.81,2417.85,0.0 -256,1024,26624,16384,torch.float8_e4m3fn,cktile,154,0,369.4329,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2418.17,1373.76,0.0 -256,2048,26624,16384,torch.float8_e4m3fn,cktile,13,0,731.002,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_256x128x128_1x4x1_16x16x128_default,2444.19,791.81,0.0 -256,4096,26624,16384,torch.float8_e4m3fn,cktile,10,0,1328.8625,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2689.08,542.89,0.0 -256,8192,26624,16384,torch.float8_e4m3fn,cktile,155,0,2500.8282,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2857.78,402.52,0.0 -256,16384,26624,16384,torch.float8_e4m3fn,cktile,10,0,4993.8298,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2862.26,315.8,0.0 -304,64,1536,5120,torch.int8,asm,0,1,16.7759,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,60.0,500.04,0.0 -304,64,5120,1280,torch.int8,asm,0,1,7.7424,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,108.35,941.68,0.0 -304,128,1536,5120,torch.int8,asm,0,1,17.9602,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,112.1,496.26,0.0 -304,128,5120,1280,torch.int8,asm,1,1,8.5797,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,195.55,935.72,0.0 -304,256,1536,5120,torch.int8,asm,0,1,17.6791,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,227.76,563.46,0.0 -304,256,5120,1280,torch.int8,asm,2,1,9.2104,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,364.31,1031.74,0.0 -304,512,1536,5120,torch.int8,asm,1,1,18.7067,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,430.49,644.62,0.0 -304,512,5120,1280,torch.int8,asm,4,1,11.2508,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,596.48,1106.75,0.0 -304,1024,1536,5120,torch.int8,asm,2,1,21.0125,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,766.5,773.49,0.0 -304,1024,5120,1280,torch.int8,asm,6,1,18.2637,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,734.89,1004.73,0.0 -304,1664,1536,5120,torch.int8,asm,4,1,27.6261,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,947.38,778.1,0.0 -304,1664,5120,1280,torch.int8,asm,6,1,21.5241,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1013.3,1195.07,0.0 -304,4096,1536,5120,torch.int8,asm,6,1,63.8361,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1009.22,648.83,0.0 -304,4096,5120,1280,torch.int8,asm,6,1,58.5804,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,916.47,917.36,0.0 -304,8192,1536,5120,torch.int8,asm,6,1,128.8567,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,999.94,581.83,0.0 -304,8192,5120,1280,torch.int8,asm,6,1,105.657,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1016.25,955.22,0.0 -304,10240,1536,5120,torch.int8,asm,6,1,142.1441,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1133.08,645.47,0.0 -304,10240,5120,1280,torch.int8,asm,6,1,133.9571,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1001.95,929.54,0.0 -304,12288,1536,5120,torch.int8,asm,7,1,185.0056,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,1044.69,586.62,0.0 -304,12288,5120,1280,torch.int8,asm,6,1,153.8919,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1046.59,962.44,0.0 -304,16384,1536,5120,torch.int8,asm,6,1,211.7054,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1217.25,671.13,0.0 -304,16384,5120,1280,torch.int8,asm,6,1,195.4489,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1098.74,999.22,0.0 -304,20480,1536,5120,torch.int8,asm,6,1,271.7016,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1185.57,646.43,0.0 -304,20480,5120,1280,torch.int8,asm,6,1,245.7639,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1092.25,986.65,0.0 -304,24576,1536,5120,torch.int8,asm,6,1,328.3654,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1177.19,637.07,0.0 -304,24576,5120,1280,torch.int8,asm,6,1,290.5831,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1108.54,996.85,0.0 -304,30720,1536,5120,torch.int8,asm,6,1,402.8387,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1199.45,644.23,0.0 -304,30720,5120,1280,torch.int8,asm,6,1,361.4235,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1114.08,997.3,0.0 -304,32768,1536,5120,torch.int8,asm,6,1,406.391,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1268.23,679.89,0.0 -304,32768,5120,1280,torch.int8,asm,6,1,379.1231,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1132.87,1012.97,0.0 -304,40960,1536,5120,torch.int8,asm,6,1,525.9013,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1225.03,652.99,0.0 -304,40960,5120,1280,torch.int8,asm,6,1,475.7676,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1128.43,1005.56,0.0 -256,32768,26624,16384,torch.float8_e4m3fn,cktile,10,0,10041.205,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2847.0,270.68,0.0 -256,1,51200,5120,torch.float8_e4m3fn,flydsl,26,0,44.5486,flydsl_bpreshuflle_16x256x512_F8_F8_B16_2x0x0x0_default,11.77,5886.86,0.0 -256,16,51200,5120,torch.float8_e4m3fn,flydsl,26,0,45.3929,flydsl_bpreshuflle_16x256x512_F8_F8_B16_2x0x0x0_default,184.8,5812.9,0.0 -256,32,51200,5120,torch.float8_e4m3fn,flydsl,918,0,47.6289,flydsl_bpreshuflle_32x128x256_F8_F8_B16_1x1x1x2_default,352.25,5576.12,0.0 -256,64,51200,5120,torch.float8_e4m3fn,flydsl,463,0,50.7771,flydsl_bpreshuflle_64x256x128_F8_F8_B16_2x0x1x1_default,660.82,5298.16,0.0 -256,128,51200,5120,torch.float8_e4m3fn,flydsl,155,0,60.4343,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x1x0_default,1110.44,4565.4,0.0 -256,256,51200,5120,torch.float8_e4m3fn,flydsl,825,0,88.2861,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,1520.26,3281.03,0.0 -256,512,51200,5120,torch.float8_e4m3fn,flydsl,825,0,154.5155,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,1737.27,2052.83,0.0 -256,1024,51200,5120,torch.float8_e4m3fn,flydsl,825,0,258.4335,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2077.4,1440.39,0.0 -256,2048,51200,5120,torch.float8_e4m3fn,flydsl,825,0,484.2625,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2217.27,996.04,0.0 -256,4096,51200,5120,torch.float8_e4m3fn,flydsl,825,0,896.4303,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2395.59,783.72,0.0 -256,8192,51200,5120,torch.float8_e4m3fn,flydsl,825,0,1732.5641,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2478.97,659.69,0.0 -256,16384,51200,5120,torch.float8_e4m3fn,flydsl,825,0,3450.4561,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2489.51,586.52,0.0 -256,32768,51200,5120,torch.float8_e4m3fn,flydsl,825,0,7359.6237,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2334.34,514.34,0.0 -256,1,53248,16384,torch.float8_e4m3fn,ck,7,0,149.1782,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.7,5848.97,0.0 -256,16,53248,16384,torch.float8_e4m3fn,ck,7,0,149.5025,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,186.73,5848.61,0.0 -256,32,53248,16384,torch.float8_e4m3fn,flydsl,1023,0,149.5234,flydsl_bpreshuflle_32x256x128_F8_F8_B16_2x0x0x3_default,373.42,5860.94,0.0 -256,64,53248,16384,torch.float8_e4m3fn,flydsl,48,0,152.6856,flydsl_bpreshuflle_64x256x128_F8_F8_B16_2x0x0x0_default,731.37,5765.31,0.0 -256,128,53248,16384,torch.float8_e4m3fn,flydsl,155,0,169.9609,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x1x0_default,1314.06,5225.58,0.0 -256,256,53248,16384,torch.float8_e4m3fn,cktile,154,0,225.5251,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,1980.61,4007.86,0.0 -256,512,53248,16384,torch.float8_e4m3fn,cktile,155,0,382.9747,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2332.67,2442.28,0.0 -256,1024,53248,16384,torch.float8_e4m3fn,cktile,13,0,751.8658,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_256x128x128_1x4x1_16x16x128_default,2376.36,1327.69,0.0 -256,2048,53248,16384,torch.float8_e4m3fn,cktile,10,0,1326.5153,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2693.83,847.39,0.0 -256,4096,53248,16384,torch.float8_e4m3fn,cktile,155,0,2515.1053,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2841.56,546.99,0.0 -256,8192,53248,16384,torch.float8_e4m3fn,cktile,10,0,4995.5375,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2861.28,376.15,0.0 -256,16384,53248,16384,torch.float8_e4m3fn,cktile,10,0,10019.6764,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2853.12,288.0,0.0 -256,32768,53248,16384,torch.float8_e4m3fn,ck,107,0,20797.6873,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2749.08,235.55,0.0 -256,1,57344,8192,torch.float8_e4m3fn,flydsl,254,0,77.9504,flydsl_bpreshuflle_16x256x512_F8_F8_B16_1x1x1x0_default,12.05,6028.0,0.0 -256,16,57344,8192,torch.float8_e4m3fn,flydsl,88,0,78.2705,flydsl_bpreshuflle_16x256x512_F8_F8_B16_1x0x1x0_default,192.06,6026.9,0.0 -256,32,57344,8192,torch.float8_e4m3fn,flydsl,1499,0,81.879,flydsl_bpreshuflle_32x128x128_F8_F8_B16_1x1x1x4_default,367.19,5785.3,0.0 -256,64,57344,8192,torch.float8_e4m3fn,flydsl,381,0,83.2587,flydsl_bpreshuflle_64x256x256_F8_F8_B16_2x0x0x1_default,722.2,5736.65,0.0 -256,128,57344,8192,torch.float8_e4m3fn,flydsl,72,0,93.9295,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x0x0_default,1280.31,5168.67,0.0 -256,256,57344,8192,torch.float8_e4m3fn,ck,33,0,134.0184,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1794.67,3739.93,0.0 -256,512,57344,8192,torch.float8_e4m3fn,ck,33,0,225.9295,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2129.14,2357.71,0.0 -256,1024,57344,8192,torch.float8_e4m3fn,flydsl,825,0,419.9967,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2290.67,1418.09,0.0 -256,2048,57344,8192,torch.float8_e4m3fn,ck,33,0,782.0667,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2460.33,922.45,0.0 -256,4096,57344,8192,torch.float8_e4m3fn,flydsl,979,0,1486.3101,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2589.16,654.69,0.0 -256,8192,57344,8192,torch.float8_e4m3fn,flydsl,825,0,2916.8532,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2638.66,506.16,0.0 -256,16384,57344,8192,torch.float8_e4m3fn,flydsl,825,0,5899.997,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2609.01,420.85,0.0 -256,32768,57344,8192,torch.float8_e4m3fn,ck,33,0,12218.2137,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2519.71,368.0,0.0 +gfx,cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx942,80,128,128,5120,torch.int8,asm,0,8,7.3189,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,22.92,183.56,0.182 +gfx942,80,192,128,5120,torch.int8,asm,1,8,7.5835,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,33.18,222.53,0.1825 +gfx942,80,128,1280,8192,torch.int8,asm,3,4,14.7532,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,181.95,804.03,0.1279 +gfx942,80,192,1280,1024,torch.int8,asm,1,1,6.7934,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,74.09,294.23,0.0 +gfx942,80,192,1280,5120,torch.int8,asm,5,4,14.7099,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,171.08,545.77,0.1266 +gfx942,80,192,1280,8192,torch.int8,asm,5,4,19.1633,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,210.12,654.91,0.1288 +gfx942,80,256,1280,8192,torch.int8,asm,3,2,22.3885,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,239.8,591.3,0.0662 +gfx942,80,320,1280,8192,torch.int8,asm,4,2,25.8821,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,259.29,538.07,0.0666 +gfx942,80,512,1280,8192,torch.int8,asm,3,1,34.2419,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,313.58,466.99,0.0 +gfx942,80,1024,1280,8192,torch.int8,asm,7,1,60.6271,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,354.21,354.56,0.0 +gfx942,80,2048,1280,8192,torch.int8,asm,7,1,118.4251,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,362.67,274.48,0.0 +gfx942,80,4096,1280,8192,torch.int8,asm,7,1,233.2788,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,368.23,233.74,0.0 +gfx942,80,8192,1280,8192,torch.int8,asm,6,1,453.0835,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,379.18,217.55,0.0 +gfx942,80,16384,1280,8192,torch.int8,asm,7,1,926.2119,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,370.97,201.52,0.0 +gfx942,80,64,1536,5120,torch.int8,asm,3,6,10.6691,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,94.35,786.25,0.1567 +gfx942,80,80,1536,5120,torch.int8,asm,4,6,11.6667,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,107.85,730.26,0.1588 +gfx942,80,128,1536,5120,torch.int8,asm,3,3,14.8921,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,135.19,598.5,0.1017 +gfx942,80,150,1536,5120,torch.int8,asm,4,3,15.3652,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,153.55,591.8,0.1017 +gfx942,80,192,1536,1024,torch.int8,asm,1,1,6.9538,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,86.86,339.28,0.0 +gfx942,80,192,1536,5120,torch.int8,asm,3,2,17.2532,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,175.03,546.98,0.0645 +gfx942,80,220,1536,5120,torch.int8,asm,4,2,18.9054,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,183.03,511.31,0.0653 +gfx942,80,256,1536,5120,torch.int8,asm,5,2,20.7976,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,193.61,478.97,0.0645 +gfx942,80,384,1536,5120,torch.int8,asm,3,1,24.6804,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,244.72,446.1,0.0 +gfx942,80,448,1536,5120,torch.int8,asm,4,1,27.7779,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,253.67,415.23,0.0 +gfx942,80,512,1536,5120,torch.int8,asm,5,1,32.3204,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,249.16,373.1,0.0 +gfx942,80,128,8192,1024,torch.int8,asm,7,1,15.3831,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,139.6,690.16,0.0 +gfx942,80,192,8192,1024,torch.int8,asm,6,1,18.2548,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,176.46,642.62,0.0 +gfx942,80,192,8192,5120,torch.int8,asm,6,1,60.0549,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,268.19,767.16,0.0 +gfx942,80,256,8192,1024,torch.int8,asm,7,1,28.1363,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,152.65,456.53,0.0 +gfx942,80,320,8192,1024,torch.int8,asm,3,1,32.0764,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,167.37,435.18,0.0 +gfx942,80,512,8192,1024,torch.int8,asm,6,1,33.8085,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,254.08,511.75,0.0 +gfx942,80,1024,8192,1024,torch.int8,asm,6,1,64.2316,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,267.47,408.12,0.0 +gfx942,80,2048,8192,1024,torch.int8,asm,6,1,123.9609,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,277.18,355.27,0.0 +gfx942,80,4096,8192,1024,torch.int8,asm,6,1,232.1688,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,295.99,343.25,0.0 +gfx942,80,8192,8192,1024,torch.int8,asm,6,1,459.697,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,298.98,328.47,0.0 +gfx942,80,16384,8192,1024,torch.int8,asm,6,1,900.9508,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,305.1,325.88,0.0 +gfx942,80,192,1536,5120,torch.float8_e4m3fnuz,cktile,9,0,18.9229,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,159.59,498.72,0.0 +gfx950,256,64,192,1024,torch.float8_e4m3fn,flydsl,989,0,3.199,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x3_default,7.87,89.63,0.0 +gfx950,256,32,384,7168,torch.float8_e4m3fn,ck,10,0,10.6799,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,16.49,281.51,0.0 +gfx950,256,64,384,7168,torch.float8_e4m3fn,ck,10,0,10.0171,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,35.17,325.49,0.0 +gfx950,256,96,384,7168,torch.float8_e4m3fn,ck,8,0,10.3046,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,51.29,341.05,0.0 +gfx950,256,256,384,7168,torch.float8_e4m3fn,ck,10,0,9.4593,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,148.98,505.76,0.0 +gfx950,256,512,384,7168,torch.float8_e4m3fn,flydsl,767,0,11.6866,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x1x2_default,241.18,583.21,0.0 +gfx950,256,1024,384,7168,torch.float8_e4m3fn,flydsl,776,0,13.3134,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x1x2_default,423.42,817.14,0.0 +gfx950,256,2048,384,7168,torch.float8_e4m3fn,cktile,152,0,18.6948,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,603.07,1016.62,0.0 +gfx950,256,4096,384,7168,torch.float8_e4m3fn,flydsl,458,0,25.4051,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x0x1x1_default,887.56,1387.85,0.0 +gfx950,256,8192,384,7168,torch.float8_e4m3fn,flydsl,792,0,34.7994,flydsl_bpreshuflle_64x192x256_F8_F8_B16_2x0x1x2_default,1295.92,1947.28,0.0 +gfx950,256,16384,384,7168,torch.float8_e4m3fn,ck,149,0,54.0722,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1668.03,2455.53,0.0 +gfx950,256,1,800,5120,torch.float8_e4m3fn,ck,10,0,8.6917,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.94,472.03,0.0 +gfx950,256,16,800,5120,torch.float8_e4m3fn,ck,10,0,8.2944,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,15.8,506.79,0.0 +gfx950,256,32,800,5120,torch.float8_e4m3fn,ck,8,0,8.5999,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,30.48,501.29,0.0 +gfx950,256,64,800,5120,torch.float8_e4m3fn,ck,10,0,8.3239,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,62.99,543.75,0.0 +gfx950,256,128,800,5120,torch.float8_e4m3fn,ck,24,0,8.2591,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,126.96,600.08,0.0 +gfx950,256,256,800,5120,torch.float8_e4m3fn,ck,10,0,9.7206,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,215.74,598.35,0.0 +gfx950,256,512,800,5120,torch.float8_e4m3fn,ck,10,0,14.8326,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,282.78,508.11,0.0 +gfx950,256,1024,800,5120,torch.float8_e4m3fn,ck,10,0,22.0756,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,379.99,497.26,0.0 +gfx950,256,2048,800,5120,torch.float8_e4m3fn,ck,10,0,39.1074,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,429.0,456.65,0.0 +gfx950,256,4096,800,5120,torch.float8_e4m3fn,ck,69,0,42.7898,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,784.17,738.99,0.0 +gfx950,256,8192,800,5120,torch.float8_e4m3fn,ck,69,0,55.2653,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1214.3,1070.22,0.0 +gfx950,256,16384,800,5120,torch.float8_e4m3fn,ck,69,0,96.4525,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1391.54,1183.97,0.0 +gfx950,256,32768,800,5120,torch.float8_e4m3fn,ck,57,0,170.0227,a8w8_bpreshuffle_256x160x160x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_4x4x1_1x1_intrawave_v3,1578.82,1319.22,0.0 +gfx950,256,96,1024,7168,torch.float8_e4m3fn,ck,10,0,10.9919,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,128.21,748.26,0.0 +gfx950,256,128,1024,7168,torch.float8_e4m3fn,ck,8,0,11.0339,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,170.3,772.14,0.0 +gfx950,256,256,1024,7168,torch.float8_e4m3fn,ck,11,0,11.9595,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,314.24,811.01,0.0 +gfx950,256,512,1024,7168,torch.float8_e4m3fn,cktile,9,0,13.7695,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,545.86,875.75,0.0 +gfx950,256,1024,1024,7168,torch.float8_e4m3fn,ck,114,0,19.6079,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,766.65,855.64,0.0 +gfx950,256,2048,1024,7168,torch.float8_e4m3fn,flydsl,626,0,27.5555,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x1x1x1_default,1091.06,951.33,0.0 +gfx950,256,4096,1024,7168,torch.float8_e4m3fn,flydsl,1280,0,40.8095,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x3_default,1473.42,1104.86,0.0 +gfx950,256,8192,1024,7168,torch.float8_e4m3fn,ck,154,0,62.6397,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1919.85,1322.44,0.0 +gfx950,256,16384,1024,7168,torch.float8_e4m3fn,ck,33,0,110.6249,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2174.18,1431.28,0.0 +gfx950,256,1,1280,8192,torch.float8_e4m3fn,ck,10,0,12.7699,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.64,821.97,0.0 +gfx950,256,16,1280,8192,torch.float8_e4m3fn,ck,10,0,12.5805,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,26.67,847.17,0.0 +gfx950,256,32,1280,8192,torch.float8_e4m3fn,ck,10,0,11.5596,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,58.05,936.87,0.0 +gfx950,256,64,1280,8192,torch.float8_e4m3fn,ck,8,0,11.6259,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,115.45,961.12,0.0 +gfx950,256,128,1280,8192,torch.float8_e4m3fn,ck,11,0,12.3468,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,217.41,960.74,0.0 +gfx950,256,256,1280,8192,torch.float8_e4m3fn,cktile,9,0,14.9227,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,359.77,887.12,0.0 +gfx950,256,512,1280,8192,torch.float8_e4m3fn,ck,114,0,20.8376,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,515.29,767.4,0.0 +gfx950,256,1024,1280,8192,torch.float8_e4m3fn,cktile,216,0,27.0331,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_48x64x256_1x4x1_16x16x128_default,794.39,795.17,0.0 +gfx950,256,2048,1280,8192,torch.float8_e4m3fn,cktile,99,0,36.1565,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x128x256_1x4x1_16x16x128_default,1187.88,899.03,0.0 +gfx950,256,4096,1280,8192,torch.float8_e4m3fn,ck,138,0,57.904,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1483.48,941.66,0.0 +gfx950,256,8192,1280,8192,torch.float8_e4m3fn,ck,51,0,94.2147,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1823.48,1046.19,0.0 +gfx950,256,16384,1280,8192,torch.float8_e4m3fn,ck,154,0,170.1443,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2019.45,1096.99,0.0 +gfx950,256,32768,1280,8192,torch.float8_e4m3fn,ck,154,0,293.2178,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2343.63,1237.33,0.0 +gfx950,256,1,2304,16384,torch.float8_e4m3fn,cktile,2,0,23.5282,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,3.21,1605.3,0.0 +gfx950,256,16,2304,16384,torch.float8_e4m3fn,ck,10,0,21.4115,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,56.42,1778.7,0.0 +gfx950,256,32,2304,16384,torch.float8_e4m3fn,ck,10,0,21.3018,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,113.41,1803.63,0.0 +gfx950,256,64,2304,16384,torch.float8_e4m3fn,ck,5,0,23.2067,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,208.21,1684.52,0.0 +gfx950,256,128,2304,16384,torch.float8_e4m3fn,cktile,152,0,27.897,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,346.41,1449.46,0.0 +gfx950,256,256,2304,16384,torch.float8_e4m3fn,ck,114,0,39.2871,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,491.95,1097.63,0.0 +gfx950,256,512,2304,16384,torch.float8_e4m3fn,cktile,117,0,48.8917,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_80x64x256_1x4x1_16x16x128_default,790.62,991.92,0.0 +gfx950,256,1024,2304,16384,torch.float8_e4m3fn,cktile,92,0,64.6028,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_80x128x256_1x4x1_16x16x128_default,1196.69,917.06,0.0 +gfx950,256,2048,2304,16384,torch.float8_e4m3fn,cktile,132,0,93.6614,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x128x256_1x4x1_16x16x128_default,1650.83,862.05,0.0 +gfx950,256,4096,2304,16384,torch.float8_e4m3fn,cktile,89,0,140.171,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_224x192x128_1x4x1_16x16x128_default,2206.15,882.72,0.0 +gfx950,256,8192,2304,16384,torch.float8_e4m3fn,cktile,55,0,261.4037,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x192x128_1x4x1_16x16x128_default,2365.98,802.27,0.0 +gfx950,256,16384,2304,16384,torch.float8_e4m3fn,cktile,234,0,474.9339,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x192x128_1x4x1_16x16x128_default,2604.47,803.65,0.0 +gfx950,256,32768,2304,16384,torch.float8_e4m3fn,cktile,234,0,941.4041,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x192x128_1x4x1_16x16x128_default,2627.88,770.78,0.0 +gfx950,256,1,2560,8192,torch.float8_e4m3fn,ck,10,0,12.9563,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,3.24,1619.66,0.0 +gfx950,256,16,2560,8192,torch.float8_e4m3fn,ck,10,0,11.7435,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,57.15,1803.94,0.0 +gfx950,256,32,2560,8192,torch.float8_e4m3fn,ck,10,0,11.955,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,112.27,1789.84,0.0 +gfx950,256,64,2560,8192,torch.float8_e4m3fn,ck,11,0,12.9931,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,206.6,1679.62,0.0 +gfx950,256,128,2560,8192,torch.float8_e4m3fn,cktile,152,0,15.9038,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,337.57,1425.79,0.0 +gfx950,256,256,2560,8192,torch.float8_e4m3fn,ck,114,0,21.9372,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,489.46,1111.33,0.0 +gfx950,256,512,2560,8192,torch.float8_e4m3fn,cktile,217,0,27.7332,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_48x64x256_1x4x1_16x16x128_default,774.34,1001.95,0.0 +gfx950,256,1024,2560,8192,torch.float8_e4m3fn,cktile,253,0,36.6417,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_96x128x256_1x4x1_16x16x128_default,1172.15,944.36,0.0 +gfx950,256,2048,2560,8192,torch.float8_e4m3fn,cktile,119,0,57.5848,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_112x128x256_1x4x1_16x16x128_default,1491.7,837.63,0.0 +gfx950,256,4096,2560,8192,torch.float8_e4m3fn,ck,51,0,93.1071,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1845.17,810.87,0.0 +gfx950,256,8192,2560,8192,torch.float8_e4m3fn,flydsl,976,0,163.6521,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x2_default,2099.56,794.51,0.0 +gfx950,256,16384,2560,8192,torch.float8_e4m3fn,flydsl,979,0,284.3899,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2416.38,840.66,0.0 +gfx950,256,32768,2560,8192,torch.float8_e4m3fn,ck,33,0,548.7549,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2504.56,833.12,0.0 +gfx950,256,1,4608,16384,torch.float8_e4m3fn,flydsl,767,0,24.8086,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x1x2_default,6.09,3044.23,0.0 +gfx950,256,16,4608,16384,torch.float8_e4m3fn,ck,10,0,22.6718,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,106.56,3348.08,0.0 +gfx950,256,32,4608,16384,torch.float8_e4m3fn,ck,11,0,25.5291,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,189.27,2989.4,0.0 +gfx950,256,64,4608,16384,torch.float8_e4m3fn,cktile,37,0,30.5476,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,316.35,2525.1,0.0 +gfx950,256,128,4608,16384,torch.float8_e4m3fn,ck,114,0,41.2697,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,468.32,1908.77,0.0 +gfx950,256,256,4608,16384,torch.float8_e4m3fn,cktile,226,0,50.4918,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_96x64x256_1x4x1_16x16x128_default,765.56,1625.04,0.0 +gfx950,256,512,4608,16384,torch.float8_e4m3fn,cktile,92,0,64.2676,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_80x128x256_1x4x1_16x16x128_default,1202.93,1378.68,0.0 +gfx950,256,1024,4608,16384,torch.float8_e4m3fn,ck,149,0,93.1125,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1660.56,1092.35,0.0 +gfx950,256,2048,4608,16384,torch.float8_e4m3fn,cktile,232,0,142.0215,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_224x192x128_1x4x1_16x16x128_default,2177.4,900.75,0.0 +gfx950,256,4096,4608,16384,torch.float8_e4m3fn,flydsl,815,0,259.8623,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x0x1x2_default,2380.01,694.04,0.0 +gfx950,256,8192,4608,16384,torch.float8_e4m3fn,cktile,235,0,472.9169,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x192x128_1x4x1_16x16x128_default,2615.58,603.09,0.0 +gfx950,256,16384,4608,16384,torch.float8_e4m3fn,cktile,121,0,915.2083,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2703.1,540.78,0.0 +gfx950,256,32768,4608,16384,torch.float8_e4m3fn,cktile,10,0,1778.3005,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2782.32,514.18,0.0 +gfx950,256,1,5120,640,torch.float8_e4m3fn,ck,9,0,5.2374,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.25,627.73,0.0 +gfx950,256,1,5120,1280,torch.float8_e4m3fn,flydsl,415,0,5.2116,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x1_default,2.52,1259.71,0.0 +gfx950,256,1,5120,3200,torch.float8_e4m3fn,ck,9,0,13.8435,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.37,1184.49,0.0 +gfx950,256,1,5120,5120,torch.float8_e4m3fn,ck,10,0,8.9938,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.83,2916.43,0.0 +gfx950,256,1,5120,6400,torch.float8_e4m3fn,flydsl,1213,0,15.5391,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x1x1x3_default,4.22,2109.82,0.0 +gfx950,256,1,5120,25600,torch.float8_e4m3fn,flydsl,1159,0,36.6584,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,7.15,3576.47,0.0 +gfx950,256,16,5120,640,torch.float8_e4m3fn,ck,9,0,5.3278,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.68,647.71,0.0 +gfx950,256,16,5120,1280,torch.float8_e4m3fn,flydsl,83,0,5.3531,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x0_default,39.18,1258.69,0.0 +gfx950,256,16,5120,3200,torch.float8_e4m3fn,ck,9,0,14.0198,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,37.4,1183.97,0.0 +gfx950,256,16,5120,5120,torch.float8_e4m3fn,ck,10,0,9.0428,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,92.77,2926.1,0.0 +gfx950,256,16,5120,6400,torch.float8_e4m3fn,flydsl,415,0,15.5054,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x1_default,67.63,2130.5,0.0 +gfx950,256,16,5120,25600,torch.float8_e4m3fn,ck,10,0,36.6833,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,114.34,3588.7,0.0 +gfx950,256,32,5120,640,torch.float8_e4m3fn,flydsl,1316,0,4.9324,flydsl_bpreshuflle_32x64x128_F8_F8_B16_2x0x0x4_default,42.52,734.93,0.0 +gfx950,256,32,5120,1280,torch.float8_e4m3fn,flydsl,664,0,5.4128,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x0x2_default,77.49,1278.86,0.0 +gfx950,256,32,5120,3200,torch.float8_e4m3fn,ck,9,0,14.3633,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,73.0,1170.63,0.0 +gfx950,256,32,5120,5120,torch.float8_e4m3fn,ck,5,0,10.6609,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,157.37,2505.03,0.0 +gfx950,256,32,5120,6400,torch.float8_e4m3fn,flydsl,415,0,15.8728,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x1_default,132.12,2097.96,0.0 +gfx950,256,32,5120,25600,torch.float8_e4m3fn,ck,11,0,39.0845,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,214.63,3382.9,0.0 +gfx950,256,64,5120,640,torch.float8_e4m3fn,flydsl,422,0,5.0513,flydsl_bpreshuflle_32x64x128_F8_F8_B16_1x0x1x1_default,83.03,786.55,0.0 +gfx950,256,64,5120,1280,torch.float8_e4m3fn,flydsl,91,0,5.5934,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x0_default,149.97,1303.48,0.0 +gfx950,256,64,5120,3200,torch.float8_e4m3fn,ck,9,0,14.7686,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,142.0,1167.62,0.0 +gfx950,256,64,5120,5120,torch.float8_e4m3fn,flydsl,529,0,12.134,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x1_default,276.53,2241.42,0.0 +gfx950,256,64,5120,6400,torch.float8_e4m3fn,flydsl,506,0,17.9065,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x1x0x1_default,234.23,1889.42,0.0 +gfx950,256,64,5120,25600,torch.float8_e4m3fn,cktile,37,0,46.4046,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,361.54,2873.98,0.0 +gfx950,256,128,5120,640,torch.float8_e4m3fn,flydsl,1365,0,5.5954,flydsl_bpreshuflle_32x128x128_F8_F8_B16_1x0x1x4_default,149.92,834.51,0.0 +gfx950,256,128,5120,1280,torch.float8_e4m3fn,flydsl,541,0,6.5611,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x1_default,255.71,1223.6,0.0 +gfx950,256,128,5120,3200,torch.float8_e4m3fn,ck,9,0,14.9077,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,281.35,1214.43,0.0 +gfx950,256,128,5120,5120,torch.float8_e4m3fn,ck,114,0,15.9196,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,421.55,1770.18,0.0 +gfx950,256,128,5120,6400,torch.float8_e4m3fn,ck,114,0,18.6977,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,448.64,1866.43,0.0 +gfx950,256,128,5120,25600,torch.float8_e4m3fn,ck,114,0,63.2614,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,530.41,2144.43,0.0 +gfx950,256,256,5120,640,torch.float8_e4m3fn,flydsl,1002,0,6.5154,flydsl_bpreshuflle_64x128x128_F8_F8_B16_1x0x0x3_default,257.5,930.42,0.0 +gfx950,256,256,5120,1280,torch.float8_e4m3fn,ck,113,0,8.4096,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,399.0,1129.98,0.0 +gfx950,256,256,5120,3200,torch.float8_e4m3fn,ck,76,0,16.7712,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,500.18,1182.06,0.0 +gfx950,256,256,5120,5120,torch.float8_e4m3fn,ck,120,0,19.4182,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,691.2,1552.49,0.0 +gfx950,256,256,5120,6400,torch.float8_e4m3fn,ck,120,0,23.2983,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,720.1,1589.29,0.0 +gfx950,256,256,5120,25600,torch.float8_e4m3fn,cktile,227,0,77.1726,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_96x64x256_1x4x1_16x16x128_default,869.59,1817.32,0.0 +gfx950,256,512,5120,640,torch.float8_e4m3fn,flydsl,1179,0,7.8677,flydsl_bpreshuflle_64x64x128_F8_F8_B16_2x1x0x3_default,426.48,1124.52,0.0 +gfx950,256,512,5120,1280,torch.float8_e4m3fn,flydsl,814,0,10.5382,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x2_default,636.82,1181.59,0.0 +gfx950,256,512,5120,3200,torch.float8_e4m3fn,ck,86,0,21.8373,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,768.28,1065.39,0.0 +gfx950,256,512,5120,5120,torch.float8_e4m3fn,ck,123,0,26.3712,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1017.91,1292.27,0.0 +gfx950,256,512,5120,6400,torch.float8_e4m3fn,ck,123,0,30.9752,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1083.27,1332.93,0.0 +gfx950,256,512,5120,25600,torch.float8_e4m3fn,cktile,99,0,96.9889,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x128x256_1x4x1_16x16x128_default,1383.85,1540.61,0.0 +gfx950,256,1024,5120,640,torch.float8_e4m3fn,flydsl,376,0,10.9979,flydsl_bpreshuflle_64x128x128_F8_F8_B16_2x0x0x1_default,610.2,1310.97,0.0 +gfx950,256,1024,5120,1280,torch.float8_e4m3fn,ck,123,0,15.4162,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,870.63,1190.31,0.0 +gfx950,256,1024,5120,3200,torch.float8_e4m3fn,ck,86,0,27.9273,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1201.49,1079.47,0.0 +gfx950,256,1024,5120,5120,torch.float8_e4m3fn,ck,51,0,40.9005,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1312.63,1025.49,0.0 +gfx950,256,1024,5120,6400,torch.float8_e4m3fn,ck,51,0,47.925,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1400.29,1039.28,0.0 +gfx950,256,1024,5120,25600,torch.float8_e4m3fn,cktile,106,0,150.9142,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x256x256_1x4x1_16x16x128_default,1778.73,1111.71,0.0 +gfx950,256,2048,5120,640,torch.float8_e4m3fn,flydsl,151,0,15.6989,flydsl_bpreshuflle_128x128x128_F8_F8_B16_2x0x1x0_default,854.95,1628.08,0.0 +gfx950,256,2048,5120,1280,torch.float8_e4m3fn,flydsl,814,0,22.7992,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x2_default,1177.39,1322.26,0.0 +gfx950,256,2048,5120,3200,torch.float8_e4m3fn,ck,51,0,43.7434,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1534.15,1003.79,0.0 +gfx950,256,2048,5120,5120,torch.float8_e4m3fn,ck,143,0,61.2716,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1752.43,941.25,0.0 +gfx950,256,2048,5120,6400,torch.float8_e4m3fn,ck,143,0,74.2562,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1807.5,900.22,0.0 +gfx950,256,2048,5120,25600,torch.float8_e4m3fn,cktile,10,0,244.7836,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2193.25,835.32,0.0 +gfx950,256,4096,5120,640,torch.float8_e4m3fn,flydsl,267,0,24.8734,flydsl_bpreshuflle_128x128x128_F8_F8_B16_1x1x1x0_default,1079.21,1923.39,0.0 +gfx950,256,4096,5120,1280,torch.float8_e4m3fn,flydsl,1204,0,37.3618,flydsl_bpreshuflle_128x128x128_F8_F8_B16_2x1x0x3_default,1436.95,1438.35,0.0 +gfx950,256,4096,5120,3200,torch.float8_e4m3fn,flydsl,1279,0,75.9837,flydsl_bpreshuflle_128x128x128_F8_F8_B16_2x1x1x3_default,1766.4,940.13,0.0 +gfx950,256,4096,5120,5120,torch.float8_e4m3fn,flydsl,814,0,111.8359,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x2_default,1920.21,796.96,0.0 +gfx950,256,4096,5120,6400,torch.float8_e4m3fn,flydsl,152,0,133.987,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x0_default,2003.44,753.25,0.0 +gfx950,256,4096,5120,25600,torch.float8_e4m3fn,cktile,130,0,453.567,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_224x128x128_1x4x1_16x16x128_default,2367.33,612.64,0.0 +gfx950,256,8192,5120,640,torch.float8_e4m3fn,flydsl,898,0,45.6191,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x0x2_default,1176.86,2025.59,0.0 +gfx950,256,8192,5120,1280,torch.float8_e4m3fn,flydsl,825,0,71.4842,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,1502.07,1411.86,0.0 +gfx950,256,8192,5120,3200,torch.float8_e4m3fn,flydsl,321,0,128.779,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x0_default,2084.47,982.18,0.0 +gfx950,256,8192,5120,5120,torch.float8_e4m3fn,flydsl,825,0,192.9599,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2225.83,787.95,0.0 +gfx950,256,8192,5120,6400,torch.float8_e4m3fn,flydsl,825,0,229.0136,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2344.28,738.31,0.0 +gfx950,256,8192,5120,25600,torch.float8_e4m3fn,cktile,10,0,825.5439,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2601.3,514.42,0.0 +gfx950,256,16384,5120,640,torch.float8_e4m3fn,flydsl,898,0,83.2532,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x0x2_default,1289.73,2180.51,0.0 +gfx950,256,16384,5120,1280,torch.float8_e4m3fn,flydsl,435,0,130.1539,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x1x1_default,1649.96,1500.51,0.0 +gfx950,256,16384,5120,3200,torch.float8_e4m3fn,flydsl,321,0,238.792,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x0_default,2248.28,990.76,0.0 +gfx950,256,16384,5120,5120,torch.float8_e4m3fn,flydsl,825,0,371.1527,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2314.39,748.67,0.0 +gfx950,256,16384,5120,6400,torch.float8_e4m3fn,flydsl,825,0,439.5686,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2442.72,694.77,0.0 +gfx950,256,16384,5120,25600,torch.float8_e4m3fn,cktile,10,0,1510.0434,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2844.27,475.66,0.0 +gfx950,256,32768,5120,640,torch.float8_e4m3fn,flydsl,736,0,161.3587,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x0x2_default,1330.88,2229.77,0.0 +gfx950,256,32768,5120,1280,torch.float8_e4m3fn,flydsl,103,0,262.1888,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x1x0_default,1638.12,1464.75,0.0 +gfx950,256,32768,5120,3200,torch.float8_e4m3fn,flydsl,653,0,489.6873,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x1_default,2192.71,932.81,0.0 +gfx950,256,32768,5120,5120,torch.float8_e4m3fn,ck,33,0,741.3936,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2317.24,714.24,0.0 +gfx950,256,32768,5120,6400,torch.float8_e4m3fn,ck,33,0,875.2866,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2453.46,660.39,0.0 +gfx950,256,32768,5120,25600,torch.float8_e4m3fn,cktile,10,0,2964.8617,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2897.25,440.32,0.0 +gfx950,256,1,6400,5120,torch.float8_e4m3fn,flydsl,1443,0,10.0031,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x4_default,6.55,3277.58,0.0 +gfx950,256,16,6400,5120,torch.float8_e4m3fn,ck,10,0,9.0378,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,116.02,3657.39,0.0 +gfx950,256,32,6400,5120,torch.float8_e4m3fn,cktile,2,0,11.1359,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,188.32,2994.05,0.0 +gfx950,256,64,6400,5120,torch.float8_e4m3fn,cktile,152,0,12.8013,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,327.65,2649.33,0.0 +gfx950,256,128,6400,5120,torch.float8_e4m3fn,ck,114,0,16.3798,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,512.13,2140.55,0.0 +gfx950,256,256,6400,5120,torch.float8_e4m3fn,ck,65,0,21.7673,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,770.75,1716.13,0.0 +gfx950,256,512,6400,5120,torch.float8_e4m3fn,ck,139,0,29.5786,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1134.42,1418.02,0.0 +gfx950,256,1024,6400,5120,torch.float8_e4m3fn,ck,138,0,45.3094,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1481.12,1128.2,0.0 +gfx950,256,2048,6400,5120,torch.float8_e4m3fn,ck,138,0,81.0869,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1655.23,856.71,0.0 +gfx950,256,4096,6400,5120,torch.float8_e4m3fn,ck,143,0,142.3021,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1886.38,746.08,0.0 +gfx950,256,8192,6400,5120,torch.float8_e4m3fn,flydsl,825,0,248.0387,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2164.46,723.95,0.0 +gfx950,256,16384,6400,5120,torch.float8_e4m3fn,flydsl,825,0,466.6318,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2301.05,699.41,0.0 +gfx950,256,32768,6400,5120,torch.float8_e4m3fn,flydsl,825,0,932.2305,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2303.6,665.04,0.0 +gfx950,256,8,6656,16384,torch.float8_e4m3fn,ck,10,0,26.6859,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,65.38,4095.4,0.0 +gfx950,256,1,7168,8192,torch.float8_e4m3fn,ck,10,0,14.4987,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.1,4051.59,0.0 +gfx950,256,16,7168,8192,torch.float8_e4m3fn,ck,24,0,14.8554,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,126.49,3977.05,0.0 +gfx950,256,32,7168,256,torch.float8_e4m3fn,flydsl,1071,0,2.8866,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x3_default,40.68,797.46,0.0 +gfx950,256,32,7168,8192,torch.float8_e4m3fn,ck,5,0,15.5595,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,241.53,3820.25,0.0 +gfx950,256,64,7168,256,torch.float8_e4m3fn,flydsl,1071,0,3.0972,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x3_default,75.84,894.0,0.0 +gfx950,256,64,7168,8192,torch.float8_e4m3fn,cktile,152,0,18.9201,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,397.26,3179.8,0.0 +gfx950,256,96,7168,256,torch.float8_e4m3fn,flydsl,753,0,3.597,flydsl_bpreshuflle_32x64x256_F8_F8_B16_1x0x1x2_default,97.95,899.59,0.0 +gfx950,256,96,7168,512,torch.float8_e4m3fn,flydsl,1299,0,4.5415,flydsl_bpreshuflle_32x128x256_F8_F8_B16_1x0x0x4_default,155.16,1121.97,0.0 +gfx950,256,128,7168,256,torch.float8_e4m3fn,flydsl,428,0,3.563,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x1_default,131.84,1039.23,0.0 +gfx950,256,128,7168,512,torch.float8_e4m3fn,flydsl,758,0,4.5422,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x2_default,206.84,1226.4,0.0 +gfx950,256,128,7168,8192,torch.float8_e4m3fn,ck,114,0,24.0436,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,625.21,2562.17,0.0 +gfx950,256,256,7168,256,torch.float8_e4m3fn,flydsl,1368,0,4.5468,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x4_default,206.63,1225.16,0.0 +gfx950,256,256,7168,512,torch.float8_e4m3fn,flydsl,1078,0,5.8237,flydsl_bpreshuflle_64x128x256_F8_F8_B16_1x0x1x3_default,322.66,1282.88,0.0 +gfx950,256,256,7168,8192,torch.float8_e4m3fn,ck,144,0,30.8344,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,975.04,2091.41,0.0 +gfx950,256,512,7168,256,torch.float8_e4m3fn,flydsl,1006,0,6.0654,flydsl_bpreshuflle_128x128x128_F8_F8_B16_1x0x0x3_default,309.8,1534.29,0.0 +gfx950,256,512,7168,8192,torch.float8_e4m3fn,ck,139,0,43.6511,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1377.5,1609.46,0.0 +gfx950,256,1024,7168,256,torch.float8_e4m3fn,flydsl,1182,0,8.8106,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x1x0x3_default,426.54,1904.21,0.0 +gfx950,256,1024,7168,512,torch.float8_e4m3fn,flydsl,16,0,11.3748,flydsl_bpreshuflle_64x256x128_F8_F8_B16_1x0x0x0_default,660.78,1659.31,0.0 +gfx950,256,1024,7168,8192,torch.float8_e4m3fn,ck,154,0,65.1203,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1846.72,1255.97,0.0 +gfx950,256,2048,7168,256,torch.float8_e4m3fn,flydsl,927,0,13.0037,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,578.0,2439.26,0.0 +gfx950,256,2048,7168,8192,torch.float8_e4m3fn,cktile,155,0,113.909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2111.49,920.54,0.0 +gfx950,256,4096,7168,256,torch.float8_e4m3fn,flydsl,976,0,22.1568,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x2_default,678.45,2780.36,0.0 +gfx950,256,4096,7168,512,torch.float8_e4m3fn,flydsl,186,0,31.9569,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,940.79,2017.95,0.0 +gfx950,256,4096,7168,8192,torch.float8_e4m3fn,flydsl,979,0,212.6801,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2261.78,709.96,0.0 +gfx950,256,8192,7168,256,torch.float8_e4m3fn,flydsl,650,0,40.7412,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x1_default,737.95,2979.11,0.0 +gfx950,256,8192,7168,8192,torch.float8_e4m3fn,flydsl,825,0,384.7901,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2500.25,632.21,0.0 +gfx950,256,16384,7168,256,torch.float8_e4m3fn,flydsl,1205,0,75.8308,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x0x3_default,792.94,3176.95,0.0 +gfx950,256,16384,7168,512,torch.float8_e4m3fn,flydsl,1533,0,107.5937,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x1x1x4_default,1117.71,2295.11,0.0 +gfx950,256,16384,7168,8192,torch.float8_e4m3fn,flydsl,979,0,746.7473,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2576.7,572.91,0.0 +gfx950,256,32768,7168,8192,torch.float8_e4m3fn,ck,33,0,1495.0753,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2573.98,533.03,0.0 +gfx950,256,1,8192,1024,torch.float8_e4m3fn,flydsl,1,0,4.4273,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x0_default,3.79,1898.68,0.0 +gfx950,256,1,8192,2048,torch.float8_e4m3fn,flydsl,827,0,5.6823,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x1x0x2_default,5.91,2955.78,0.0 +gfx950,256,1,8192,3584,torch.float8_e4m3fn,ck,10,0,7.9608,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,7.38,3690.6,0.0 +gfx950,256,1,8192,7168,torch.float8_e4m3fn,flydsl,848,0,13.2795,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x2_default,8.84,4423.65,0.0 +gfx950,256,1,8192,8192,torch.float8_e4m3fn,ck,24,0,15.2454,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,8.8,4403.52,0.0 +gfx950,256,1,8192,28672,torch.float8_e4m3fn,flydsl,1159,0,45.1638,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,10.4,5201.65,0.0 +gfx950,256,16,8192,1024,torch.float8_e4m3fn,flydsl,1289,0,4.409,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x4_default,60.88,1965.78,0.0 +gfx950,256,16,8192,2048,torch.float8_e4m3fn,flydsl,1,0,5.5976,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x0_default,95.91,3049.9,0.0 +gfx950,256,16,8192,3584,torch.float8_e4m3fn,ck,10,0,7.841,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,119.82,3785.18,0.0 +gfx950,256,16,8192,7168,torch.float8_e4m3fn,ck,10,0,13.2832,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,141.46,4449.01,0.0 +gfx950,256,16,8192,8192,torch.float8_e4m3fn,ck,24,0,15.4927,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,138.61,4357.02,0.0 +gfx950,256,16,8192,28672,torch.float8_e4m3fn,ck,10,0,43.6421,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,172.22,5398.5,0.0 +gfx950,256,32,8192,1024,torch.float8_e4m3fn,flydsl,673,0,4.8647,flydsl_bpreshuflle_32x64x512_F8_F8_B16_1x0x0x2_default,110.36,1838.89,0.0 +gfx950,256,32,8192,2048,torch.float8_e4m3fn,ck,5,0,6.3939,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,167.93,2716.19,0.0 +gfx950,256,32,8192,3584,torch.float8_e4m3fn,ck,5,0,8.8319,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,212.76,3396.68,0.0 +gfx950,256,32,8192,7168,torch.float8_e4m3fn,ck,11,0,14.6614,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,256.33,4056.5,0.0 +gfx950,256,32,8192,8192,torch.float8_e4m3fn,ck,5,0,16.4179,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,261.6,4135.44,0.0 +gfx950,256,32,8192,28672,torch.float8_e4m3fn,ck,5,0,48.1626,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,312.12,4906.77,0.0 +gfx950,256,64,8192,2048,torch.float8_e4m3fn,cktile,37,0,7.4625,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,287.77,2406.28,0.0 +gfx950,256,64,8192,3584,torch.float8_e4m3fn,cktile,37,0,10.8507,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,346.35,2823.6,0.0 +gfx950,256,64,8192,7168,torch.float8_e4m3fn,cktile,37,0,17.4069,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,431.79,3459.98,0.0 +gfx950,256,64,8192,8192,torch.float8_e4m3fn,cktile,37,0,19.3604,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,443.69,3547.54,0.0 +gfx950,256,64,8192,28672,torch.float8_e4m3fn,cktile,151,0,56.6791,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_32x64x512_1x4x1_16x16x128_default,530.44,4194.93,0.0 +gfx950,256,128,8192,1024,torch.float8_e4m3fn,flydsl,677,0,6.3283,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x0x2_default,339.35,1677.68,0.0 +gfx950,256,128,8192,2048,torch.float8_e4m3fn,flydsl,869,0,9.304,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x2_default,461.63,2056.8,0.0 +gfx950,256,128,8192,3584,torch.float8_e4m3fn,ck,114,0,13.2791,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,566.02,2403.48,0.0 +gfx950,256,128,8192,7168,torch.float8_e4m3fn,ck,114,0,22.4532,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,669.5,2749.49,0.0 +gfx950,256,128,8192,8192,torch.float8_e4m3fn,ck,114,0,25.4862,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,674.09,2756.57,0.0 +gfx950,256,128,8192,28672,torch.float8_e4m3fn,ck,114,0,75.9032,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,792.19,3170.46,0.0 +gfx950,256,256,8192,1024,torch.float8_e4m3fn,flydsl,1128,0,8.3043,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x0x1x3_default,517.2,1546.8,0.0 +gfx950,256,256,8192,2048,torch.float8_e4m3fn,flydsl,1203,0,11.5596,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x1x0x3_default,743.1,1859.56,0.0 +gfx950,256,256,8192,3584,torch.float8_e4m3fn,ck,65,0,17.0541,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,881.45,2021.33,0.0 +gfx950,256,256,8192,7168,torch.float8_e4m3fn,ck,65,0,28.8311,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1042.79,2245.82,0.0 +gfx950,256,256,8192,8192,torch.float8_e4m3fn,ck,144,0,31.9715,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1074.7,2295.8,0.0 +gfx950,256,256,8192,28672,torch.float8_e4m3fn,ck,144,0,97.4398,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1234.19,2528.9,0.0 +gfx950,256,512,8192,1024,torch.float8_e4m3fn,flydsl,1418,0,11.2514,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x4_default,763.45,1537.72,0.0 +gfx950,256,512,8192,2048,torch.float8_e4m3fn,ck,139,0,15.9434,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1077.55,1644.22,0.0 +gfx950,256,512,8192,3584,torch.float8_e4m3fn,ck,139,0,23.5971,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1274.09,1677.48,0.0 +gfx950,256,512,8192,7168,torch.float8_e4m3fn,cktile,135,0,40.5772,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1481.86,1744.3,0.0 +gfx950,256,512,8192,8192,torch.float8_e4m3fn,cktile,135,0,45.0861,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1524.18,1767.55,0.0 +gfx950,256,512,8192,28672,torch.float8_e4m3fn,cktile,135,0,129.391,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1858.85,1993.57,0.0 +gfx950,256,1024,8192,1024,torch.float8_e4m3fn,flydsl,925,0,16.7804,flydsl_bpreshuflle_128x128x128_F8_F8_B16_1x1x1x2_default,1023.81,1562.2,0.0 +gfx950,256,1024,8192,2048,torch.float8_e4m3fn,ck,154,0,25.4967,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1347.62,1398.28,0.0 +gfx950,256,1024,8192,3584,torch.float8_e4m3fn,ck,154,0,36.3958,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1652.1,1368.49,0.0 +gfx950,256,1024,8192,7168,torch.float8_e4m3fn,ck,154,0,61.9703,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1940.59,1336.73,0.0 +gfx950,256,1024,8192,8192,torch.float8_e4m3fn,ck,154,0,69.8202,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1968.47,1321.6,0.0 +gfx950,256,1024,8192,28672,torch.float8_e4m3fn,ck,154,0,201.8712,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2382.89,1392.07,0.0 +gfx950,256,2048,8192,1024,torch.float8_e4m3fn,flydsl,846,0,23.7296,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x2_default,1447.97,1855.92,0.0 +gfx950,256,2048,8192,2048,torch.float8_e4m3fn,flydsl,269,0,42.4634,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x0_default,1618.32,1284.07,0.0 +gfx950,256,2048,8192,3584,torch.float8_e4m3fn,flydsl,684,0,66.9636,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x0x2_default,1795.89,1049.15,0.0 +gfx950,256,2048,8192,7168,torch.float8_e4m3fn,ck,33,0,108.3244,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2220.35,987.36,0.0 +gfx950,256,2048,8192,8192,torch.float8_e4m3fn,cktile,155,0,122.2729,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2248.07,960.48,0.0 +gfx950,256,2048,8192,28672,torch.float8_e4m3fn,cktile,154,0,329.0964,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2923.38,994.1,0.0 +gfx950,256,4096,8192,1024,torch.float8_e4m3fn,flydsl,518,0,47.2123,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x1_default,1455.54,1687.95,0.0 +gfx950,256,4096,8192,2048,torch.float8_e4m3fn,flydsl,601,0,76.8953,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1787.35,1200.0,0.0 +gfx950,256,4096,8192,3584,torch.float8_e4m3fn,flydsl,825,0,119.4876,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2012.91,930.21,0.0 +gfx950,256,4096,8192,7168,torch.float8_e4m3fn,flydsl,825,0,206.4213,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2330.36,751.81,0.0 +gfx950,256,4096,8192,8192,torch.float8_e4m3fn,flydsl,979,0,228.5549,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2405.36,734.06,0.0 +gfx950,256,4096,8192,28672,torch.float8_e4m3fn,cktile,10,0,643.0461,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2992.24,652.26,0.0 +gfx950,256,8192,8192,1024,torch.float8_e4m3fn,flydsl,186,0,95.7087,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,1436.01,1577.65,0.0 +gfx950,256,8192,8192,2048,torch.float8_e4m3fn,flydsl,927,0,146.7966,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,1872.51,1142.89,0.0 +gfx950,256,8192,8192,3584,torch.float8_e4m3fn,flydsl,825,0,216.5513,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2221.35,890.96,0.0 +gfx950,256,8192,8192,7168,torch.float8_e4m3fn,flydsl,825,0,387.3297,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2483.86,649.73,0.0 +gfx950,256,8192,8192,8192,torch.float8_e4m3fn,flydsl,979,0,435.3376,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2525.65,616.61,0.0 +gfx950,256,8192,8192,28672,torch.float8_e4m3fn,cktile,10,0,1294.8599,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2971.97,466.44,0.0 +gfx950,256,16384,8192,1024,torch.float8_e4m3fn,flydsl,927,0,181.2201,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,1516.82,1620.14,0.0 +gfx950,256,16384,8192,2048,torch.float8_e4m3fn,flydsl,103,0,287.1698,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x0x1x0_default,1914.39,1110.03,0.0 +gfx950,256,16384,8192,3584,torch.float8_e4m3fn,flydsl,825,0,427.2274,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2251.9,834.49,0.0 +gfx950,256,16384,8192,7168,torch.float8_e4m3fn,flydsl,825,0,768.6011,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2503.44,578.45,0.0 +gfx950,256,16384,8192,8192,torch.float8_e4m3fn,flydsl,979,0,871.056,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2524.55,539.3,0.0 +gfx950,256,16384,8192,28672,torch.float8_e4m3fn,cktile,154,0,2614.7771,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2943.49,372.15,0.0 +gfx950,256,32768,8192,1024,torch.float8_e4m3fn,ck,51,0,367.838,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1494.56,1573.56,0.0 +gfx950,256,32768,8192,2048,torch.float8_e4m3fn,ck,143,0,576.9388,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1905.77,1075.95,0.0 +gfx950,256,32768,8192,3584,torch.float8_e4m3fn,flydsl,825,0,864.7149,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2225.18,790.63,0.0 +gfx950,256,32768,8192,7168,torch.float8_e4m3fn,ck,33,0,1526.85,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2520.41,543.91,0.0 +gfx950,256,32768,8192,8192,torch.float8_e4m3fn,cktile,115,0,1726.1188,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2547.94,505.42,0.0 +gfx950,256,32768,8192,28672,torch.float8_e4m3fn,cktile,10,0,5232.9311,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2941.59,327.02,0.0 +gfx950,256,1,9216,16384,torch.float8_e4m3fn,flydsl,1159,0,28.55,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,10.58,5290.01,0.0 +gfx950,256,16,9216,16384,torch.float8_e4m3fn,ck,5,0,30.546,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,158.18,4961.44,0.0 +gfx950,256,32,9216,16384,torch.float8_e4m3fn,flydsl,857,0,34.613,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x2_default,279.19,4394.56,0.0 +gfx950,256,64,9216,16384,torch.float8_e4m3fn,flydsl,624,0,45.6522,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x1x1_default,423.36,3356.32,0.0 +gfx950,256,128,9216,16384,torch.float8_e4m3fn,cktile,75,0,53.8184,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_48x128x256_1x4x1_16x16x128_default,718.24,2888.44,0.0 +gfx950,256,256,9216,16384,torch.float8_e4m3fn,cktile,132,0,66.0848,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x128x256_1x4x1_16x16x128_default,1169.85,2419.74,0.0 +gfx950,256,512,9216,16384,torch.float8_e4m3fn,ck,149,0,92.2759,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1675.61,1829.52,0.0 +gfx950,256,1024,9216,16384,torch.float8_e4m3fn,cktile,233,0,142.3588,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_224x192x128_1x4x1_16x16x128_default,2172.24,1311.1,0.0 +gfx950,256,2048,9216,16384,torch.float8_e4m3fn,cktile,55,0,265.8551,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x192x128_1x4x1_16x16x128_default,2326.36,836.16,0.0 +gfx950,256,4096,9216,16384,torch.float8_e4m3fn,cktile,121,0,464.1345,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2665.07,632.58,0.0 +gfx950,256,8192,9216,16384,torch.float8_e4m3fn,cktile,235,0,919.8305,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x192x128_1x4x1_16x16x128_default,2689.52,474.23,0.0 +gfx950,256,16384,9216,16384,torch.float8_e4m3fn,cktile,10,0,1743.5349,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2837.8,413.77,0.0 +gfx950,256,32768,9216,16384,torch.float8_e4m3fn,cktile,154,0,3492.6909,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2833.23,369.87,0.0 +gfx950,256,1,10240,8192,torch.float8_e4m3fn,flydsl,929,0,17.3869,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x1x2_default,9.65,4826.32,0.0 +gfx950,256,16,10240,8192,torch.float8_e4m3fn,ck,10,0,18.0502,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,148.72,4672.79,0.0 +gfx950,256,32,10240,8192,torch.float8_e4m3fn,cktile,37,0,20.9702,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,256.02,4044.0,0.0 +gfx950,256,64,10240,8192,torch.float8_e4m3fn,ck,114,0,25.7553,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,416.9,3328.29,0.0 +gfx950,256,128,10240,8192,torch.float8_e4m3fn,ck,120,0,31.1068,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,690.36,2814.69,0.0 +gfx950,256,256,10240,8192,torch.float8_e4m3fn,cktile,253,0,39.3727,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_96x128x256_1x4x1_16x16x128_default,1090.85,2316.99,0.0 +gfx950,256,512,10240,8192,torch.float8_e4m3fn,ck,143,0,56.8519,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1510.93,1733.74,0.0 +gfx950,256,1024,10240,8192,torch.float8_e4m3fn,ck,51,0,95.1718,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1805.14,1189.91,0.0 +gfx950,256,2048,10240,8192,torch.float8_e4m3fn,flydsl,1280,0,168.0196,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x1x1x3_default,2044.98,848.75,0.0 +gfx950,256,4096,10240,8192,torch.float8_e4m3fn,flydsl,979,0,280.4078,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2450.7,717.98,0.0 +gfx950,256,8192,10240,8192,torch.float8_e4m3fn,flydsl,979,0,533.7477,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2574.98,597.22,0.0 +gfx950,256,16384,10240,8192,torch.float8_e4m3fn,flydsl,979,0,1059.2396,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2595.05,522.68,0.0 +gfx950,256,32768,10240,8192,torch.float8_e4m3fn,ck,154,0,2180.7525,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2520.95,469.29,0.0 +gfx950,256,1,12800,5120,torch.float8_e4m3fn,flydsl,22,0,12.9565,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x0_default,10.12,5060.53,0.0 +gfx950,256,16,12800,5120,torch.float8_e4m3fn,flydsl,848,0,13.1569,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x2_default,159.4,5018.47,0.0 +gfx950,256,32,12800,5120,torch.float8_e4m3fn,flydsl,1018,0,14.4111,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x3_default,291.05,4615.82,0.0 +gfx950,256,64,12800,5120,torch.float8_e4m3fn,flydsl,541,0,18.6039,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x1_default,450.91,3628.38,0.0 +gfx950,256,128,12800,5120,torch.float8_e4m3fn,ck,65,0,22.6804,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,739.72,3062.92,0.0 +gfx950,256,256,12800,5120,torch.float8_e4m3fn,flydsl,152,0,31.0997,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x0_default,1078.93,2360.16,0.0 +gfx950,256,512,12800,5120,torch.float8_e4m3fn,ck,138,0,46.4049,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1446.16,1751.21,0.0 +gfx950,256,1024,12800,5120,torch.float8_e4m3fn,cktile,130,0,80.2052,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_224x128x128_1x4x1_16x16x128_default,1673.43,1209.31,0.0 +gfx950,256,2048,12800,5120,torch.float8_e4m3fn,ck,51,0,146.6395,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1830.58,875.96,0.0 +gfx950,256,4096,12800,5120,torch.float8_e4m3fn,flydsl,825,0,249.1087,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2155.17,768.2,0.0 +gfx950,256,8192,12800,5120,torch.float8_e4m3fn,flydsl,825,0,456.4241,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2352.51,694.96,0.0 +gfx950,256,16384,12800,5120,torch.float8_e4m3fn,flydsl,825,0,882.1332,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2434.42,644.86,0.0 +gfx950,256,32768,12800,5120,torch.float8_e4m3fn,cktile,115,0,1784.2978,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2407.09,600.89,0.0 +gfx950,256,1,13312,16384,torch.float8_e4m3fn,flydsl,686,0,39.3036,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x2_default,11.1,5550.3,0.0 +gfx950,256,16,13312,16384,torch.float8_e4m3fn,ck,25,0,39.8372,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,175.2,5492.15,0.0 +gfx950,256,32,13312,16384,torch.float8_e4m3fn,flydsl,529,0,40.8319,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x1_default,341.86,5375.21,0.0 +gfx950,256,64,13312,16384,torch.float8_e4m3fn,ck,114,0,50.5591,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,552.17,4368.28,0.0 +gfx950,256,128,13312,16384,torch.float8_e4m3fn,ck,65,0,61.0826,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,914.08,3660.76,0.0 +gfx950,256,256,13312,16384,torch.float8_e4m3fn,cktile,1,0,78.4817,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x256_1x4x1_16x16x128_default,1422.87,2919.33,0.0 +gfx950,256,512,13312,16384,torch.float8_e4m3fn,ck,154,0,119.7862,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1864.47,2004.6,0.0 +gfx950,256,1024,13312,16384,torch.float8_e4m3fn,cktile,10,0,188.278,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2372.43,1392.32,0.0 +gfx950,256,2048,13312,16384,torch.float8_e4m3fn,cktile,155,0,366.501,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2437.52,835.43,0.0 +gfx950,256,4096,13312,16384,torch.float8_e4m3fn,flydsl,825,0,704.7777,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2535.13,559.42,0.0 +gfx950,256,8192,13312,16384,torch.float8_e4m3fn,cktile,10,0,1317.8416,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2711.56,432.85,0.0 +gfx950,256,16384,13312,16384,torch.float8_e4m3fn,cktile,154,0,2517.5291,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2838.83,366.53,0.0 +gfx950,256,32768,13312,16384,torch.float8_e4m3fn,cktile,10,0,5030.6118,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2841.33,323.5,0.0 +gfx950,256,1,14336,8192,torch.float8_e4m3fn,ck,10,0,21.8553,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.75,5375.24,0.0 +gfx950,256,16,14336,8192,torch.float8_e4m3fn,ck,10,0,21.3952,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,175.65,5516.67,0.0 +gfx950,256,32,14336,8192,torch.float8_e4m3fn,flydsl,31,0,25.1546,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x0_default,298.8,4715.64,0.0 +gfx950,256,64,14336,8192,torch.float8_e4m3fn,ck,114,0,29.5305,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,509.05,4056.82,0.0 +gfx950,256,128,14336,8192,torch.float8_e4m3fn,ck,144,0,37.2517,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,807.07,3279.29,0.0 +gfx950,256,256,14336,8192,torch.float8_e4m3fn,cktile,29,0,46.2123,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x256_1x4x1_16x16x128_default,1301.16,2745.54,0.0 +gfx950,256,512,14336,8192,torch.float8_e4m3fn,ck,154,0,65.8134,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1827.27,2071.23,0.0 +gfx950,256,1024,14336,8192,torch.float8_e4m3fn,cktile,154,0,116.9719,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2056.2,1326.72,0.0 +gfx950,256,2048,14336,8192,torch.float8_e4m3fn,cktile,115,0,218.7174,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2199.35,882.13,0.0 +gfx950,256,4096,14336,8192,torch.float8_e4m3fn,flydsl,825,0,386.6129,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2488.47,694.33,0.0 +gfx950,256,8192,14336,8192,torch.float8_e4m3fn,flydsl,979,0,738.2272,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2606.44,568.16,0.0 +gfx950,256,16384,14336,8192,torch.float8_e4m3fn,flydsl,825,0,1489.9573,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2582.82,484.19,0.0 +gfx950,256,32768,14336,8192,torch.float8_e4m3fn,cktile,82,0,2971.0677,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_96x256x128_1x4x1_16x16x128_default,2590.51,446.1,0.0 +gfx950,256,1,16384,2048,torch.float8_e4m3fn,flydsl,333,0,7.7788,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x1_default,8.63,4318.05,0.0 +gfx950,256,1,16384,4096,torch.float8_e4m3fn,flydsl,520,0,14.2993,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x1_default,9.39,4695.74,0.0 +gfx950,256,1,16384,6656,torch.float8_e4m3fn,flydsl,1159,0,19.6315,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x3_default,11.11,5556.95,0.0 +gfx950,256,1,16384,8192,torch.float8_e4m3fn,ck,24,0,25.4479,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,10.55,5275.83,0.0 +gfx950,256,1,16384,13312,torch.float8_e4m3fn,flydsl,745,0,37.3614,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x1x2_default,11.68,5838.91,0.0 +gfx950,256,1,16384,26624,torch.float8_e4m3fn,cktile,144,0,71.1231,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x256_1x4x1_16x16x128_default,12.27,6133.97,0.0 +gfx950,256,4,16384,6656,torch.float8_e4m3fn,flydsl,1009,0,19.7247,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x3_default,44.23,5536.69,0.0 +gfx950,256,8,16384,6656,torch.float8_e4m3fn,flydsl,848,0,19.7962,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x2_default,88.14,5524.66,0.0 +gfx950,256,16,16384,2048,torch.float8_e4m3fn,flydsl,333,0,7.7788,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x1_default,138.03,4385.19,0.0 +gfx950,256,16,16384,4096,torch.float8_e4m3fn,ck,10,0,14.0016,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,153.37,4835.07,0.0 +gfx950,256,16,16384,6656,torch.float8_e4m3fn,flydsl,686,0,19.8218,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x2_default,176.05,5533.44,0.0 +gfx950,256,16,16384,8192,torch.float8_e4m3fn,ck,10,0,24.0843,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,178.33,5600.04,0.0 +gfx950,256,16,16384,13312,torch.float8_e4m3fn,flydsl,188,0,37.636,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x1x0x0_default,185.44,5814.67,0.0 +gfx950,256,16,16384,26624,torch.float8_e4m3fn,flydsl,354,0,71.2847,flydsl_bpreshuflle_16x64x512_F8_F8_B16_2x0x0x1_default,195.82,6132.56,0.0 +gfx950,256,32,16384,2048,torch.float8_e4m3fn,ck,12,0,8.8156,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,243.6,3932.64,0.0 +gfx950,256,32,16384,4096,torch.float8_e4m3fn,flydsl,1018,0,15.3879,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x3_default,279.11,4437.81,0.0 +gfx950,256,32,16384,6656,torch.float8_e4m3fn,flydsl,197,0,20.6179,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x1x0x0_default,338.51,5350.37,0.0 +gfx950,256,32,16384,8192,torch.float8_e4m3fn,cktile,9,0,27.6417,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,310.76,4903.04,0.0 +gfx950,256,32,16384,13312,torch.float8_e4m3fn,flydsl,363,0,38.3032,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x1_default,364.43,5732.64,0.0 +gfx950,256,32,16384,26624,torch.float8_e4m3fn,flydsl,363,0,71.7202,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x1_default,389.25,6108.57,0.0 +gfx950,256,64,16384,2048,torch.float8_e4m3fn,flydsl,869,0,10.0391,flydsl_bpreshuflle_64x64x256_F8_F8_B16_2x1x0x2_default,427.82,3564.33,0.0 +gfx950,256,64,16384,4096,torch.float8_e4m3fn,ck,114,0,19.0594,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,450.69,3644.82,0.0 +gfx950,256,64,16384,6656,torch.float8_e4m3fn,ck,114,0,24.5126,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,569.45,4551.74,0.0 +gfx950,256,64,16384,8192,torch.float8_e4m3fn,ck,119,0,31.4037,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,547.07,4357.42,0.0 +gfx950,256,64,16384,13312,torch.float8_e4m3fn,ck,114,0,44.9751,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,620.73,4915.01,0.0 +gfx950,256,64,16384,26624,torch.float8_e4m3fn,ck,114,0,83.2724,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,670.51,5283.97,0.0 +gfx950,256,128,16384,2048,torch.float8_e4m3fn,flydsl,1483,0,12.3136,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x1x0x4_default,697.6,3086.9,0.0 +gfx950,256,128,16384,4096,torch.float8_e4m3fn,ck,144,0,23.4825,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,731.6,3058.77,0.0 +gfx950,256,128,16384,6656,torch.float8_e4m3fn,ck,144,0,31.2393,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,893.66,3652.39,0.0 +gfx950,256,128,16384,8192,torch.float8_e4m3fn,cktile,24,0,38.2204,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x128_default,898.99,3648.85,0.0 +gfx950,256,128,16384,13312,torch.float8_e4m3fn,cktile,40,0,55.2899,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x64x256_1x4x1_16x16x128_default,1009.85,4051.41,0.0 +gfx950,256,128,16384,26624,torch.float8_e4m3fn,cktile,158,0,103.2711,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x64x256_1x4x1_16x16x128_default,1081.32,4297.52,0.0 +gfx950,256,256,16384,2048,torch.float8_e4m3fn,flydsl,1130,0,16.7273,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x3_default,1027.06,2538.8,0.0 +gfx950,256,256,16384,4096,torch.float8_e4m3fn,ck,139,0,28.7093,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1196.82,2666.25,0.0 +gfx950,256,256,16384,6656,torch.float8_e4m3fn,cktile,29,0,41.8226,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x256_1x4x1_16x16x128_default,1335.03,2848.81,0.0 +gfx950,256,256,16384,8192,torch.float8_e4m3fn,cktile,1,0,48.6422,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_128x128x256_1x4x1_16x16x128_default,1412.75,2974.85,0.0 +gfx950,256,256,16384,13312,torch.float8_e4m3fn,cktile,135,0,71.204,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_128x128x256_1x4x1_16x16x128_default,1568.3,3228.76,0.0 +gfx950,256,256,16384,26624,torch.float8_e4m3fn,cktile,29,0,131.8695,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_128x128x256_1x4x1_16x16x128_default,1693.63,3423.17,0.0 +gfx950,256,512,16384,2048,torch.float8_e4m3fn,flydsl,495,0,25.8961,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x1_default,1326.83,1984.09,0.0 +gfx950,256,512,16384,4096,torch.float8_e4m3fn,ck,154,0,43.5445,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1578.14,1974.61,0.0 +gfx950,256,512,16384,6656,torch.float8_e4m3fn,ck,154,0,60.6062,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1842.54,2132.41,0.0 +gfx950,256,512,16384,8192,torch.float8_e4m3fn,ck,154,0,69.0175,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1991.36,2248.55,0.0 +gfx950,256,512,16384,13312,torch.float8_e4m3fn,ck,154,0,106.2418,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2102.17,2274.97,0.0 +gfx950,256,512,16384,26624,torch.float8_e4m3fn,ck,154,0,195.6027,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2283.59,2385.53,0.0 +gfx950,256,1024,16384,2048,torch.float8_e4m3fn,flydsl,927,0,47.073,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x2_default,1459.85,1470.18,0.0 +gfx950,256,1024,16384,4096,torch.float8_e4m3fn,ck,107,0,77.2916,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1778.19,1356.65,0.0 +gfx950,256,1024,16384,6656,torch.float8_e4m3fn,ck,33,0,104.7702,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2131.7,1426.19,0.0 +gfx950,256,1024,16384,8192,torch.float8_e4m3fn,cktile,154,0,121.4869,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2262.61,1450.04,0.0 +gfx950,256,1024,16384,13312,torch.float8_e4m3fn,cktile,155,0,176.2123,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2534.88,1505.51,0.0 +gfx950,256,1024,16384,26624,torch.float8_e4m3fn,cktile,155,0,314.8139,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2837.72,1578.79,0.0 +gfx950,256,2048,16384,2048,torch.float8_e4m3fn,flydsl,601,0,80.3525,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1710.45,1304.97,0.0 +gfx950,256,2048,16384,4096,torch.float8_e4m3fn,flydsl,979,0,134.1836,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2048.52,1062.77,0.0 +gfx950,256,2048,16384,6656,torch.float8_e4m3fn,ck,33,0,194.6083,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2295.26,975.25,0.0 +gfx950,256,2048,16384,8192,torch.float8_e4m3fn,ck,33,0,230.0845,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2389.36,947.93,0.0 +gfx950,256,2048,16384,13312,torch.float8_e4m3fn,cktile,155,0,339.3213,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2632.76,920.88,0.0 +gfx950,256,2048,16384,26624,torch.float8_e4m3fn,cktile,154,0,604.2855,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2956.73,923.14,0.0 +gfx950,256,4096,16384,2048,torch.float8_e4m3fn,flydsl,601,0,147.6909,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1861.17,1192.77,0.0 +gfx950,256,4096,16384,4096,torch.float8_e4m3fn,flydsl,979,0,241.7976,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2273.62,902.01,0.0 +gfx950,256,4096,16384,6656,torch.float8_e4m3fn,flydsl,825,0,365.2692,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2445.74,740.64,0.0 +gfx950,256,4096,16384,8192,torch.float8_e4m3fn,flydsl,979,0,432.8399,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2540.23,697.69,0.0 +gfx950,256,4096,16384,13312,torch.float8_e4m3fn,cktile,155,0,665.018,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2686.7,611.78,0.0 +gfx950,256,4096,16384,26624,torch.float8_e4m3fn,cktile,155,0,1192.4676,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2996.65,569.81,0.0 +gfx950,256,8192,16384,2048,torch.float8_e4m3fn,flydsl,979,0,292.9922,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1876.35,1087.97,0.0 +gfx950,256,8192,16384,4096,torch.float8_e4m3fn,flydsl,979,0,467.2444,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2353.18,789.95,0.0 +gfx950,256,8192,16384,6656,torch.float8_e4m3fn,flydsl,825,0,720.0273,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2481.44,600.0,0.0 +gfx950,256,8192,16384,8192,torch.float8_e4m3fn,flydsl,825,0,855.8274,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2569.47,548.9,0.0 +gfx950,256,8192,16384,13312,torch.float8_e4m3fn,cktile,155,0,1317.2113,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2712.86,452.16,0.0 +gfx950,256,8192,16384,26624,torch.float8_e4m3fn,cktile,10,0,2446.191,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2921.61,377.22,0.0 +gfx950,256,16384,16384,2048,torch.float8_e4m3fn,flydsl,979,0,573.0956,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1918.55,1053.89,0.0 +gfx950,256,16384,16384,4096,torch.float8_e4m3fn,flydsl,979,0,945.5081,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2325.76,709.77,0.0 +gfx950,256,16384,16384,6656,torch.float8_e4m3fn,flydsl,825,0,1434.0728,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2491.79,526.45,0.0 +gfx950,256,16384,16384,8192,torch.float8_e4m3fn,cktile,115,0,1706.7491,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2576.86,471.84,0.0 +gfx950,256,16384,16384,13312,torch.float8_e4m3fn,cktile,10,0,2589.7115,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2759.7,375.75,0.0 +gfx950,256,16384,16384,26624,torch.float8_e4m3fn,cktile,155,0,4846.7561,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2949.12,290.77,0.0 +gfx950,256,32768,16384,2048,torch.float8_e4m3fn,ck,143,0,1121.2198,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1961.28,1047.44,0.0 +gfx950,256,32768,16384,4096,torch.float8_e4m3fn,flydsl,979,0,1893.374,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2322.86,673.44,0.0 +gfx950,256,32768,16384,6656,torch.float8_e4m3fn,cktile,115,0,2831.5792,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2523.97,494.74,0.0 +gfx950,256,32768,16384,8192,torch.float8_e4m3fn,cktile,115,0,3423.2243,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,2569.53,431.29,0.0 +gfx950,256,32768,16384,13312,torch.float8_e4m3fn,cktile,10,0,5107.7843,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2798.41,338.32,0.0 +gfx950,256,32768,16384,26624,torch.float8_e4m3fn,cktile,155,0,9941.6576,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2875.51,239.63,0.0 +gfx950,256,1,26624,16384,torch.float8_e4m3fn,flydsl,1138,0,75.2625,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x1x0x3_default,11.59,5796.74,0.0 +gfx950,256,16,26624,16384,torch.float8_e4m3fn,flydsl,0,0,75.7716,flydsl_bpreshuflle_16x64x256_F8_F8_B16_1x0x0x0_default,184.22,5771.58,0.0 +gfx950,256,32,26624,16384,torch.float8_e4m3fn,flydsl,1317,0,77.5854,flydsl_bpreshuflle_32x64x256_F8_F8_B16_2x0x0x4_default,359.83,5651.01,0.0 +gfx950,256,64,26624,16384,torch.float8_e4m3fn,flydsl,377,0,79.7028,flydsl_bpreshuflle_64x128x256_F8_F8_B16_2x0x0x1_default,700.53,5528.84,0.0 +gfx950,256,128,26624,16384,torch.float8_e4m3fn,flydsl,1418,0,97.1275,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x4_default,1149.72,4582.85,0.0 +gfx950,256,256,26624,16384,torch.float8_e4m3fn,ck,154,0,131.4447,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1699.1,3454.18,0.0 +gfx950,256,512,26624,16384,torch.float8_e4m3fn,cktile,155,0,195.1564,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2288.81,2417.85,0.0 +gfx950,256,1024,26624,16384,torch.float8_e4m3fn,cktile,154,0,369.4329,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,2418.17,1373.76,0.0 +gfx950,256,2048,26624,16384,torch.float8_e4m3fn,cktile,13,0,731.002,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_256x128x128_1x4x1_16x16x128_default,2444.19,791.81,0.0 +gfx950,256,4096,26624,16384,torch.float8_e4m3fn,cktile,10,0,1328.8625,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2689.08,542.89,0.0 +gfx950,256,8192,26624,16384,torch.float8_e4m3fn,cktile,155,0,2500.8282,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2857.78,402.52,0.0 +gfx950,256,16384,26624,16384,torch.float8_e4m3fn,cktile,10,0,4993.8298,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2862.26,315.8,0.0 +gfx942,304,64,1536,5120,torch.int8,asm,0,1,16.7759,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,60.0,500.04,0.0 +gfx942,304,64,5120,1280,torch.int8,asm,0,1,7.7424,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,108.35,941.68,0.0 +gfx942,304,128,1536,5120,torch.int8,asm,0,1,17.9602,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,112.1,496.26,0.0 +gfx942,304,128,5120,1280,torch.int8,asm,1,1,8.5797,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,195.55,935.72,0.0 +gfx942,304,256,1536,5120,torch.int8,asm,0,1,17.6791,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,227.76,563.46,0.0 +gfx942,304,256,5120,1280,torch.int8,asm,2,1,9.2104,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,364.31,1031.74,0.0 +gfx942,304,512,1536,5120,torch.int8,asm,1,1,18.7067,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,430.49,644.62,0.0 +gfx942,304,512,5120,1280,torch.int8,asm,4,1,11.2508,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,596.48,1106.75,0.0 +gfx942,304,1024,1536,5120,torch.int8,asm,2,1,21.0125,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,766.5,773.49,0.0 +gfx942,304,1024,5120,1280,torch.int8,asm,6,1,18.2637,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,734.89,1004.73,0.0 +gfx942,304,1664,1536,5120,torch.int8,asm,4,1,27.6261,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,947.38,778.1,0.0 +gfx942,304,1664,5120,1280,torch.int8,asm,6,1,21.5241,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1013.3,1195.07,0.0 +gfx942,304,4096,1536,5120,torch.int8,asm,6,1,63.8361,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1009.22,648.83,0.0 +gfx942,304,4096,5120,1280,torch.int8,asm,6,1,58.5804,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,916.47,917.36,0.0 +gfx942,304,8192,1536,5120,torch.int8,asm,6,1,128.8567,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,999.94,581.83,0.0 +gfx942,304,8192,5120,1280,torch.int8,asm,6,1,105.657,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1016.25,955.22,0.0 +gfx942,304,10240,1536,5120,torch.int8,asm,6,1,142.1441,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1133.08,645.47,0.0 +gfx942,304,10240,5120,1280,torch.int8,asm,6,1,133.9571,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1001.95,929.54,0.0 +gfx942,304,12288,1536,5120,torch.int8,asm,7,1,185.0056,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,1044.69,586.62,0.0 +gfx942,304,12288,5120,1280,torch.int8,asm,6,1,153.8919,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1046.59,962.44,0.0 +gfx942,304,16384,1536,5120,torch.int8,asm,6,1,211.7054,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1217.25,671.13,0.0 +gfx942,304,16384,5120,1280,torch.int8,asm,6,1,195.4489,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1098.74,999.22,0.0 +gfx942,304,20480,1536,5120,torch.int8,asm,6,1,271.7016,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1185.57,646.43,0.0 +gfx942,304,20480,5120,1280,torch.int8,asm,6,1,245.7639,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1092.25,986.65,0.0 +gfx942,304,24576,1536,5120,torch.int8,asm,6,1,328.3654,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1177.19,637.07,0.0 +gfx942,304,24576,5120,1280,torch.int8,asm,6,1,290.5831,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1108.54,996.85,0.0 +gfx942,304,30720,1536,5120,torch.int8,asm,6,1,402.8387,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1199.45,644.23,0.0 +gfx942,304,30720,5120,1280,torch.int8,asm,6,1,361.4235,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1114.08,997.3,0.0 +gfx942,304,32768,1536,5120,torch.int8,asm,6,1,406.391,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1268.23,679.89,0.0 +gfx942,304,32768,5120,1280,torch.int8,asm,6,1,379.1231,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1132.87,1012.97,0.0 +gfx942,304,40960,1536,5120,torch.int8,asm,6,1,525.9013,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1225.03,652.99,0.0 +gfx942,304,40960,5120,1280,torch.int8,asm,6,1,475.7676,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1128.43,1005.56,0.0 +gfx950,256,32768,26624,16384,torch.float8_e4m3fn,cktile,10,0,10041.205,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2847.0,270.68,0.0 +gfx950,256,1,51200,5120,torch.float8_e4m3fn,flydsl,26,0,44.5486,flydsl_bpreshuflle_16x256x512_F8_F8_B16_2x0x0x0_default,11.77,5886.86,0.0 +gfx950,256,16,51200,5120,torch.float8_e4m3fn,flydsl,26,0,45.3929,flydsl_bpreshuflle_16x256x512_F8_F8_B16_2x0x0x0_default,184.8,5812.9,0.0 +gfx950,256,32,51200,5120,torch.float8_e4m3fn,flydsl,918,0,47.6289,flydsl_bpreshuflle_32x128x256_F8_F8_B16_1x1x1x2_default,352.25,5576.12,0.0 +gfx950,256,64,51200,5120,torch.float8_e4m3fn,flydsl,463,0,50.7771,flydsl_bpreshuflle_64x256x128_F8_F8_B16_2x0x1x1_default,660.82,5298.16,0.0 +gfx950,256,128,51200,5120,torch.float8_e4m3fn,flydsl,155,0,60.4343,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x1x0_default,1110.44,4565.4,0.0 +gfx950,256,256,51200,5120,torch.float8_e4m3fn,flydsl,825,0,88.2861,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,1520.26,3281.03,0.0 +gfx950,256,512,51200,5120,torch.float8_e4m3fn,flydsl,825,0,154.5155,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,1737.27,2052.83,0.0 +gfx950,256,1024,51200,5120,torch.float8_e4m3fn,flydsl,825,0,258.4335,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2077.4,1440.39,0.0 +gfx950,256,2048,51200,5120,torch.float8_e4m3fn,flydsl,825,0,484.2625,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2217.27,996.04,0.0 +gfx950,256,4096,51200,5120,torch.float8_e4m3fn,flydsl,825,0,896.4303,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2395.59,783.72,0.0 +gfx950,256,8192,51200,5120,torch.float8_e4m3fn,flydsl,825,0,1732.5641,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2478.97,659.69,0.0 +gfx950,256,16384,51200,5120,torch.float8_e4m3fn,flydsl,825,0,3450.4561,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2489.51,586.52,0.0 +gfx950,256,32768,51200,5120,torch.float8_e4m3fn,flydsl,825,0,7359.6237,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2334.34,514.34,0.0 +gfx950,256,1,53248,16384,torch.float8_e4m3fn,ck,7,0,149.1782,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.7,5848.97,0.0 +gfx950,256,16,53248,16384,torch.float8_e4m3fn,ck,7,0,149.5025,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,186.73,5848.61,0.0 +gfx950,256,32,53248,16384,torch.float8_e4m3fn,flydsl,1023,0,149.5234,flydsl_bpreshuflle_32x256x128_F8_F8_B16_2x0x0x3_default,373.42,5860.94,0.0 +gfx950,256,64,53248,16384,torch.float8_e4m3fn,flydsl,48,0,152.6856,flydsl_bpreshuflle_64x256x128_F8_F8_B16_2x0x0x0_default,731.37,5765.31,0.0 +gfx950,256,128,53248,16384,torch.float8_e4m3fn,flydsl,155,0,169.9609,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x1x0_default,1314.06,5225.58,0.0 +gfx950,256,256,53248,16384,torch.float8_e4m3fn,cktile,154,0,225.5251,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_256x256x128_1x4x1_16x16x128_default,1980.61,4007.86,0.0 +gfx950,256,512,53248,16384,torch.float8_e4m3fn,cktile,155,0,382.9747,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2332.67,2442.28,0.0 +gfx950,256,1024,53248,16384,torch.float8_e4m3fn,cktile,13,0,751.8658,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_256x128x128_1x4x1_16x16x128_default,2376.36,1327.69,0.0 +gfx950,256,2048,53248,16384,torch.float8_e4m3fn,cktile,10,0,1326.5153,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2693.83,847.39,0.0 +gfx950,256,4096,53248,16384,torch.float8_e4m3fn,cktile,155,0,2515.1053,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_256x256x128_1x4x1_16x16x128_default,2841.56,546.99,0.0 +gfx950,256,8192,53248,16384,torch.float8_e4m3fn,cktile,10,0,4995.5375,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2861.28,376.15,0.0 +gfx950,256,16384,53248,16384,torch.float8_e4m3fn,cktile,10,0,10019.6764,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_256x256x128_1x4x1_16x16x128_default,2853.12,288.0,0.0 +gfx950,256,32768,53248,16384,torch.float8_e4m3fn,ck,107,0,20797.6873,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2749.08,235.55,0.0 +gfx950,256,1,57344,8192,torch.float8_e4m3fn,flydsl,254,0,77.9504,flydsl_bpreshuflle_16x256x512_F8_F8_B16_1x1x1x0_default,12.05,6028.0,0.0 +gfx950,256,16,57344,8192,torch.float8_e4m3fn,flydsl,88,0,78.2705,flydsl_bpreshuflle_16x256x512_F8_F8_B16_1x0x1x0_default,192.06,6026.9,0.0 +gfx950,256,32,57344,8192,torch.float8_e4m3fn,flydsl,1499,0,81.879,flydsl_bpreshuflle_32x128x128_F8_F8_B16_1x1x1x4_default,367.19,5785.3,0.0 +gfx950,256,64,57344,8192,torch.float8_e4m3fn,flydsl,381,0,83.2587,flydsl_bpreshuflle_64x256x256_F8_F8_B16_2x0x0x1_default,722.2,5736.65,0.0 +gfx950,256,128,57344,8192,torch.float8_e4m3fn,flydsl,72,0,93.9295,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x0x0x0_default,1280.31,5168.67,0.0 +gfx950,256,256,57344,8192,torch.float8_e4m3fn,ck,33,0,134.0184,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1794.67,3739.93,0.0 +gfx950,256,512,57344,8192,torch.float8_e4m3fn,ck,33,0,225.9295,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2129.14,2357.71,0.0 +gfx950,256,1024,57344,8192,torch.float8_e4m3fn,flydsl,825,0,419.9967,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2290.67,1418.09,0.0 +gfx950,256,2048,57344,8192,torch.float8_e4m3fn,ck,33,0,782.0667,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2460.33,922.45,0.0 +gfx950,256,4096,57344,8192,torch.float8_e4m3fn,flydsl,979,0,1486.3101,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,2589.16,654.69,0.0 +gfx950,256,8192,57344,8192,torch.float8_e4m3fn,flydsl,825,0,2916.8532,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2638.66,506.16,0.0 +gfx950,256,16384,57344,8192,torch.float8_e4m3fn,flydsl,825,0,5899.997,flydsl_bpreshuflle_256x128x128_F8_F8_B16_2x0x1x2_default,2609.01,420.85,0.0 +gfx950,256,32768,57344,8192,torch.float8_e4m3fn,ck,33,0,12218.2137,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2519.71,368.0,0.0 diff --git a/aiter/configs/a8w8_tuned_batched_gemm.csv b/aiter/configs/a8w8_tuned_batched_gemm.csv index 2fb46f19cb..4762445b29 100644 --- a/aiter/configs/a8w8_tuned_batched_gemm.csv +++ b/aiter/configs/a8w8_tuned_batched_gemm.csv @@ -1,27 +1,27 @@ -cu_num,B,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio -304,16,32,1280,8192,28,0,68.9821,a8w8_batched_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,155.6551,5004.8295,0.0 -304,16,64,1280,8192,21,0,74.9374,a8w8_batched_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,286.5703,4736.5264,0.0 -304,16,128,1280,8192,41,0,111.2581,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,386.0364,3364.6236,0.0 -304,16,192,1280,8192,11,0,136.9273,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,470.5016,2875.5426,0.0 -304,16,256,1280,8192,11,0,150.6582,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,570.1604,2742.2267,0.0 -304,16,320,1280,8192,41,0,194.5238,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,551.9848,2223.5716,0.0 -304,16,512,1280,8192,4,0,235.9793,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,728.0244,2079.5619,0.0 -304,16,1024,1280,8192,4,0,457.3867,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,751.2186,1412.2029,0.0 -304,16,2048,1280,8192,13,0,831.9798,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,825.9753,1149.4285,0.0 -304,16,4096,1280,8192,39,0,1490.3195,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,922.2113,1058.2015,0.0 -304,16,8192,1280,8192,1,0,2894.8037,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,949.5563,973.6661,0.0 -304,16,16384,1280,8192,1,0,5696.639,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,965.0529,930.6541,0.0 -304,16,1,8192,1024,78,0,37.703,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,7.1197,7127.5593,0.0 -304,16,32,8192,1024,62,0,46.8522,a8w8_batched_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,183.3411,5930.8344,0.0 -304,16,64,8192,1024,47,0,56.4451,a8w8_batched_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,304.3642,5090.0756,0.0 -304,16,128,8192,1024,13,0,78.8949,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,435.5128,3880.9124,0.0 -304,16,192,8192,1024,39,0,113.2351,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,455.1558,2870.6519,0.0 -304,16,256,8192,1024,13,0,127.391,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,539.4375,2699.8212,0.0 -304,16,320,8192,1024,13,0,172.9103,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,496.7856,2098.2399,0.0 -304,16,512,8192,1024,13,0,229.5169,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,598.8184,1827.4489,0.0 -304,16,1024,8192,1024,13,0,426.5342,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,644.4452,1337.3496,0.0 -304,16,2048,8192,1024,13,0,823.4174,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,667.6514,1059.5055,0.0 -304,16,4096,8192,1024,1,0,1583.6971,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,694.2689,932.2458,0.0 -304,16,8192,8192,1024,13,0,3131.9626,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,702.1231,857.0838,0.0 -304,16,16384,8192,1024,1,0,6094.2926,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,721.6665,836.8935,0.0 -80,16,1,1280,8192,78,0,86.7259,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,3.869,3872.5159,0.0 +gfx,cu_num,B,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx942,304,16,32,1280,8192,28,0,68.9821,a8w8_batched_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,155.6551,5004.8295,0.0 +gfx942,304,16,64,1280,8192,21,0,74.9374,a8w8_batched_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,286.5703,4736.5264,0.0 +gfx942,304,16,128,1280,8192,41,0,111.2581,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,386.0364,3364.6236,0.0 +gfx942,304,16,192,1280,8192,11,0,136.9273,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,470.5016,2875.5426,0.0 +gfx942,304,16,256,1280,8192,11,0,150.6582,a8w8_batched_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,570.1604,2742.2267,0.0 +gfx942,304,16,320,1280,8192,41,0,194.5238,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,551.9848,2223.5716,0.0 +gfx942,304,16,512,1280,8192,4,0,235.9793,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,728.0244,2079.5619,0.0 +gfx942,304,16,1024,1280,8192,4,0,457.3867,a8w8_batched_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,751.2186,1412.2029,0.0 +gfx942,304,16,2048,1280,8192,13,0,831.9798,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,825.9753,1149.4285,0.0 +gfx942,304,16,4096,1280,8192,39,0,1490.3195,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,922.2113,1058.2015,0.0 +gfx942,304,16,8192,1280,8192,1,0,2894.8037,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,949.5563,973.6661,0.0 +gfx942,304,16,16384,1280,8192,1,0,5696.639,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,965.0529,930.6541,0.0 +gfx942,304,16,1,8192,1024,78,0,37.703,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,7.1197,7127.5593,0.0 +gfx942,304,16,32,8192,1024,62,0,46.8522,a8w8_batched_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,183.3411,5930.8344,0.0 +gfx942,304,16,64,8192,1024,47,0,56.4451,a8w8_batched_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,304.3642,5090.0756,0.0 +gfx942,304,16,128,8192,1024,13,0,78.8949,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,435.5128,3880.9124,0.0 +gfx942,304,16,192,8192,1024,39,0,113.2351,a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,455.1558,2870.6519,0.0 +gfx942,304,16,256,8192,1024,13,0,127.391,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,539.4375,2699.8212,0.0 +gfx942,304,16,320,8192,1024,13,0,172.9103,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,496.7856,2098.2399,0.0 +gfx942,304,16,512,8192,1024,13,0,229.5169,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,598.8184,1827.4489,0.0 +gfx942,304,16,1024,8192,1024,13,0,426.5342,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,644.4452,1337.3496,0.0 +gfx942,304,16,2048,8192,1024,13,0,823.4174,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,667.6514,1059.5055,0.0 +gfx942,304,16,4096,8192,1024,1,0,1583.6971,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,694.2689,932.2458,0.0 +gfx942,304,16,8192,8192,1024,13,0,3131.9626,a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,702.1231,857.0838,0.0 +gfx942,304,16,16384,8192,1024,1,0,6094.2926,a8w8_batched_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,721.6665,836.8935,0.0 +gfx942,80,16,1,1280,8192,78,0,86.7259,a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,3.869,3872.5159,0.0 diff --git a/aiter/configs/a8w8_tuned_gemm.csv b/aiter/configs/a8w8_tuned_gemm.csv index 69b3fd2d5b..f503e2ddab 100644 --- a/aiter/configs/a8w8_tuned_gemm.csv +++ b/aiter/configs/a8w8_tuned_gemm.csv @@ -1,583 +1,583 @@ -cu_num,M,N,K,q_dtype_w,kernelId,splitK,us,kernelName,tflops,bw,errRatio -80,1,1280,8192,torch.int8,34,0,20.5611,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1.02,510.5,0.0 -80,1,8192,1024,torch.int8,78,0,9.1327,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,1.84,920.43,0.0 -80,32,1280,8192,torch.int8,34,0,19.9841,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,33.58,541.92,0.0 -80,32,8192,1024,torch.int8,28,0,9.9382,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,54.02,900.13,0.0 -80,64,1280,8192,torch.int8,34,0,19.6589,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,68.27,568.39,0.0 -80,64,8192,1024,torch.int8,21,0,13.6554,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,78.63,695.89,0.0 -80,128,1280,8192,torch.int8,30,0,23.3376,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,115.02,508.28,0.0 -80,128,8192,1024,torch.int8,45,0,20.3684,a8w8_rowwise_256x128x64x128_32x32_2x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,105.43,521.24,0.0 -80,192,1280,8192,torch.int8,23,0,33.0028,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,122.01,380.28,0.0 -80,192,8192,1024,torch.int8,47,0,26.1312,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,123.27,448.92,0.0 -80,256,1280,8192,torch.int8,23,0,33.3204,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,161.12,397.3,0.0 -80,256,8192,1024,torch.int8,13,0,29.7436,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,144.4,431.86,0.0 -80,320,1280,8192,torch.int8,21,0,54.4822,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,123.18,255.61,0.0 -80,320,8192,1024,torch.int8,47,0,39.6412,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,135.43,352.14,0.0 -80,512,1280,8192,torch.int8,21,0,55.0378,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,195.09,290.54,0.0 -80,512,8192,1024,torch.int8,13,0,52.947,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,162.24,326.77,0.0 -80,1024,1280,8192,torch.int8,13,0,88.6516,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,242.24,242.48,0.0 -80,1024,8192,1024,torch.int8,13,0,90.2628,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,190.33,290.42,0.0 -80,2048,1280,8192,torch.int8,13,0,140.7575,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,305.13,230.94,0.0 -80,2048,8192,1024,torch.int8,41,0,165.9552,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,207.04,265.37,0.0 -80,4096,1280,8192,torch.int8,13,0,275.3109,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,312.01,198.05,0.0 -80,4096,8192,1024,torch.int8,13,0,305.9089,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,224.64,260.51,0.0 -80,8192,1280,8192,torch.int8,13,0,545.0371,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,315.21,180.84,0.0 -80,8192,8192,1024,torch.int8,13,0,598.5803,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,229.61,252.26,0.0 -80,16384,1280,8192,torch.int8,13,0,1075.9373,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,319.35,173.47,0.0 -80,16384,8192,1024,torch.int8,13,0,1176.7808,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,233.58,249.5,0.0 -256,1,100,5120,torch.int8,34,0,8.7502,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.12,59.12,0.0 -256,1,200,5120,torch.int8,34,0,9.0193,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.23,114.15,0.0 -256,1,800,5120,torch.int8,34,0,11.5471,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.71,355.3,0.0 -256,1,1280,8192,torch.int8,34,0,17.6914,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1.19,593.31,0.0 -256,1,1280,8192,torch.float8_e4m3fn,79,3,7.6797,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,2.73,1366.79,0.1344 -256,1,2304,16384,torch.int8,34,0,28.7938,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2.62,1311.73,0.0 -256,1,2560,8192,torch.int8,34,0,17.4658,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2.4,1201.48,0.0 -256,1,4608,16384,torch.int8,34,0,30.4899,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.95,2476.99,0.0 -256,1,5120,640,torch.int8,79,0,4.3227,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,1.52,760.56,0.0 -256,1,5120,1280,torch.int8,34,0,6.1049,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2.15,1075.39,0.0 -256,1,5120,3200,torch.int8,30,0,9.8138,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,3.34,1670.86,0.0 -256,1,5120,5120,torch.int8,34,0,12.3186,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.26,2129.28,0.0 -256,1,5120,6400,torch.int8,34,0,14.5779,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.5,2248.93,0.0 -256,1,5120,25600,torch.int8,23,0,47.1407,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,5.56,2781.2,0.0 -256,1,6400,5120,torch.int8,34,0,12.7506,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,5.14,2571.32,0.0 -256,1,7168,8192,torch.int8,34,0,19.015,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.18,3089.29,0.0 -256,1,8192,1024,torch.int8,79,0,5.7218,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,2.93,1469.12,0.0 -256,1,8192,1024,torch.float8_e4m3fn,60,0,6.3982,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,2.62,1313.81,0.0 -256,1,8192,2048,torch.int8,34,0,7.1625,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.68,2344.94,0.0 -256,1,8192,3584,torch.int8,34,0,11.2488,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,5.22,2611.84,0.0 -256,1,8192,7168,torch.int8,34,0,16.9371,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.93,3468.35,0.0 -256,1,8192,8192,torch.int8,34,0,19.4709,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.89,3447.89,0.0 -256,1,8192,28672,torch.int8,34,0,59.8294,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,7.85,3926.6,0.0 -256,1,9216,16384,torch.int8,34,0,34.1179,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.85,4426.7,0.0 -256,1,10240,8192,torch.int8,34,0,20.5574,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.16,4081.97,0.0 -256,1,12800,5120,torch.int8,34,0,14.7015,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.92,4459.87,0.0 -256,1,13312,16384,torch.int8,34,0,43.9839,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.92,4959.7,0.0 -256,1,16384,2048,torch.int8,34,0,8.2219,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.16,4085.34,0.0 -256,1,16384,4096,torch.int8,34,0,15.9093,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.44,4220.53,0.0 -256,1,16384,6656,torch.int8,34,0,22.3992,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.74,4870.32,0.0 -256,1,16384,8192,torch.int8,34,0,29.3945,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.13,4567.48,0.0 -256,1,16384,13312,torch.int8,34,0,39.0876,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,11.16,5581.05,0.0 -256,1,16384,26624,torch.int8,34,0,73.9812,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,11.79,5897.0,0.0 -256,1,26624,16384,torch.int8,34,0,88.2574,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.88,4943.24,0.0 -256,1,51200,5120,torch.int8,30,0,50.042,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,10.48,5240.63,0.0 -256,1,53248,16384,torch.int8,57,0,170.1222,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_intrawave_v2,10.26,5128.89,0.0 -256,1,57344,8192,torch.int8,76,0,88.899,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,10.57,5285.6,0.0 -256,16,100,5120,torch.int8,34,0,8.7447,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1.87,68.28,0.0 -256,16,200,5120,torch.int8,34,0,10.4705,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,3.13,106.23,0.0 -256,16,800,5120,torch.int8,34,0,11.7219,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,11.18,358.6,0.0 -256,16,1024,8192,torch.float8_e4m3fn,79,3,8.199,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,32.74,1043.11,0.1616 -256,16,1280,8192,torch.int8,34,0,15.0987,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.22,705.87,0.0 -256,16,1280,8192,torch.float8_e4m3fn,79,3,8.8968,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,37.72,1197.94,0.1643 -256,16,2304,16384,torch.int8,34,0,23.498,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,51.41,1620.76,0.0 -256,16,2560,8192,torch.int8,34,0,14.8202,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,45.28,1429.43,0.0 -256,16,3584,8192,torch.float8_e4m3fn,34,2,10.689,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,87.9,2769.75,0.1092 -256,16,4608,16384,torch.int8,34,0,25.7089,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,93.97,2952.56,0.0 -256,16,5120,640,torch.int8,60,0,4.3842,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,23.92,787.12,0.0 -256,16,5120,1280,torch.int8,79,0,6.1038,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,34.36,1103.89,0.0 -256,16,5120,3200,torch.int8,79,0,9.1289,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,57.43,1818.3,0.0 -256,16,5120,5120,torch.int8,34,0,11.1363,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,75.33,2376.03,0.0 -256,16,5120,6400,torch.int8,34,0,14.8838,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,70.45,2219.48,0.0 -256,16,5120,25600,torch.int8,34,0,38.7631,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,108.2,3396.15,0.0 -256,16,6400,5120,torch.int8,34,0,11.5523,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.77,2861.31,0.0 -256,16,7168,8192,torch.int8,34,0,16.6523,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,112.84,3547.9,0.0 -256,16,7168,8192,torch.float8_e4m3fn,80,2,15.5438,a8w8_rowwise_128x16x64x128_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,120.89,3800.92,0.1089 -256,16,7424,8192,torch.float8_e4m3fn,60,2,16.1728,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,120.34,3783.27,0.1076 -256,16,8192,1024,torch.int8,79,0,5.4586,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,49.18,1587.79,0.0 -256,16,8192,2048,torch.int8,79,0,7.2552,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,74.0,2353.09,0.0 -256,16,8192,3584,torch.int8,34,0,10.6477,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,88.24,2787.42,0.0 -256,16,8192,7168,torch.int8,34,0,15.2131,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,123.52,3884.62,0.0 -256,16,8192,8192,torch.int8,34,0,17.2283,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,124.65,3918.09,0.0 -256,16,8192,8192,torch.float8_e4m3fn,80,2,17.7026,a8w8_rowwise_128x16x64x128_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,121.31,3813.12,0.107 -256,16,8192,28672,torch.int8,34,0,55.9502,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,134.34,4210.92,0.0 -256,16,9216,16384,torch.int8,34,0,33.174,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,145.65,4568.4,0.0 -256,16,10240,8192,torch.int8,34,0,18.6072,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,144.26,4532.91,0.0 -256,16,10240,8192,torch.float8_e4m3fn,34,0,19.5591,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,137.24,4312.31,0.0 -256,16,12800,5120,torch.int8,34,0,14.2563,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,147.1,4631.46,0.0 -256,16,13312,16384,torch.int8,30,0,45.4589,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,153.53,4812.96,0.0 -256,16,16384,2048,torch.int8,34,0,8.2068,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,130.84,4156.49,0.0 -256,16,16384,4096,torch.int8,30,0,16.22,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,132.4,4173.78,0.0 -256,16,16384,6656,torch.int8,34,0,21.7401,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,160.52,5045.18,0.0 -256,16,16384,8192,torch.int8,30,0,29.6526,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,144.84,4548.44,0.0 -256,16,16384,13312,torch.int8,34,0,38.6763,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,180.45,5658.27,0.0 -256,16,16384,26624,torch.int8,34,0,73.7744,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,189.21,5925.6,0.0 -256,16,26624,16384,torch.int8,30,0,86.8898,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,160.65,5033.06,0.0 -256,16,28672,8192,torch.float8_e4m3fn,30,0,46.9441,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,160.11,5025.76,0.0 -256,16,51200,5120,torch.int8,30,0,51.7349,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,162.15,5100.32,0.0 -256,16,53248,16384,torch.int8,34,0,180.7857,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,154.42,4836.56,0.0 -256,16,57344,8192,torch.int8,80,0,90.9234,a8w8_rowwise_128x16x64x128_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,165.33,5188.19,0.0 -256,16,57344,8192,torch.float8_e4m3fn,69,2,92.3442,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v1,162.79,5108.37,0.1072 -256,32,100,5120,torch.int8,34,0,9.2459,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,3.54,73.79,0.0 -256,32,200,5120,torch.int8,34,0,10.4825,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.25,114.54,0.0 -256,32,800,5120,torch.int8,34,0,11.9874,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,21.87,359.63,0.0 -256,32,1024,8192,torch.float8_e4m3fn,79,3,8.7911,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,61.07,991.49,0.1586 -256,32,1280,8192,torch.int8,34,0,14.9955,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,44.75,722.2,0.0 -256,32,1280,8192,torch.float8_e4m3fn,60,3,9.3466,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,71.8,1158.69,0.1581 -256,32,2304,16384,torch.int8,34,0,23.2436,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,103.94,1652.95,0.0 -256,32,2560,8192,torch.int8,34,0,14.7991,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.69,1445.87,0.0 -256,32,3584,8192,torch.float8_e4m3fn,34,2,12.64,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,148.66,2361.68,0.1084 -256,32,4608,16384,torch.int8,34,0,24.957,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,193.61,3057.93,0.0 -256,32,5120,640,torch.int8,79,0,4.5986,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,45.6,788.27,0.0 -256,32,5120,1280,torch.int8,76,0,6.0464,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,69.37,1144.85,0.0 -256,32,5120,3200,torch.int8,76,0,10.1191,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,103.62,1661.62,0.0 -256,32,5120,5120,torch.int8,34,0,10.4248,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,160.94,2561.77,0.0 -256,32,5120,6400,torch.int8,34,0,15.0885,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,138.99,2207.01,0.0 -256,32,5120,25600,torch.int8,34,0,38.3258,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,218.88,3449.87,0.0 -256,32,6400,5120,torch.int8,34,0,10.9816,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,190.97,3036.12,0.0 -256,32,7168,8192,torch.int8,34,0,16.942,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,221.82,3508.51,0.0 -256,32,7168,8192,torch.float8_e4m3fn,34,0,17.4961,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,214.8,3397.39,0.0 -256,32,7424,8192,torch.float8_e4m3fn,34,0,17.6324,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,220.75,3491.0,0.0 -256,32,8192,1024,torch.int8,79,0,5.6281,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,95.39,1589.46,0.0 -256,32,8192,1024,torch.float8_e4m3fn,79,0,6.5212,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,82.33,1371.78,0.0 -256,32,8192,2048,torch.int8,76,0,7.2889,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,147.31,2382.67,0.0 -256,32,8192,3584,torch.int8,34,0,9.4243,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,199.38,3183.17,0.0 -256,32,8192,7168,torch.int8,34,0,16.2902,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,230.7,3650.9,0.0 -256,32,8192,8192,torch.int8,34,0,17.4586,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,246.01,3888.93,0.0 -256,32,8192,8192,torch.float8_e4m3fn,34,0,18.0338,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,238.16,3764.89,0.0 -256,32,8192,28672,torch.int8,34,0,54.7382,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,274.62,4317.33,0.0 -256,32,9216,16384,torch.int8,30,0,34.6699,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,278.73,4387.35,0.0 -256,32,10240,8192,torch.int8,30,0,20.8953,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,256.93,4058.5,0.0 -256,32,10240,8192,torch.float8_e4m3fn,30,0,21.4049,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,250.82,3961.88,0.0 -256,32,12800,5120,torch.int8,30,0,14.9578,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,280.41,4447.11,0.0 -256,32,13312,16384,torch.int8,30,0,46.5921,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,299.59,4710.67,0.0 -256,32,16384,2048,torch.int8,30,0,10.4352,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,205.79,3322.27,0.0 -256,32,16384,4096,torch.int8,30,0,16.4693,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,260.79,4146.41,0.0 -256,32,16384,6656,torch.int8,30,0,22.4014,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,311.56,4924.4,0.0 -256,32,16384,8192,torch.int8,30,0,30.4324,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,282.26,4453.43,0.0 -256,32,16384,13312,torch.int8,30,0,39.8351,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,350.41,5512.18,0.0 -256,32,16384,26624,torch.int8,30,0,74.3742,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,375.36,5890.59,0.0 -256,32,26624,16384,torch.int8,30,0,89.3239,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,312.54,4908.38,0.0 -256,32,28672,8192,torch.float8_e4m3fn,23,0,47.2467,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,318.17,5015.76,0.0 -256,32,51200,5120,torch.int8,30,0,53.9026,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,311.25,4927.12,0.0 -256,32,53248,16384,torch.int8,34,0,168.8762,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,330.62,5189.29,0.0 -256,32,57344,8192,torch.int8,81,0,93.0943,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_interwave_v2,322.95,5088.33,0.0 -256,32,57344,8192,torch.float8_e4m3fn,62,0,92.8953,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,323.64,5099.23,0.0 -256,48,7424,8192,torch.float8_e4m3fn,74,2,20.2638,a8w8_rowwise_128x64x32x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,288.12,3055.86,0.1076 -256,64,100,5120,torch.int8,34,0,9.6716,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.78,88.14,0.0 -256,64,192,1024,torch.int8,77,0,4.1582,a8w8_rowwise_64x16x16x64_16x16_1x1_4x16x1_4x16x1_1x16x1x4_4x4x1_1x1_interwave_v2,6.05,68.95,0.0 -256,64,200,5120,torch.int8,34,0,9.9254,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,13.21,138.76,0.0 -256,64,800,5120,torch.int8,34,0,12.0241,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,43.6,376.42,0.0 -256,64,1024,8192,torch.float8_e4m3fn,34,2,9.0741,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,118.33,996.68,0.1083 -256,64,1280,8192,torch.int8,34,0,14.7691,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.88,756.57,0.0 -256,64,1280,8192,torch.float8_e4m3fn,79,2,11.0767,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,121.17,1008.77,0.1076 -256,64,2304,16384,torch.int8,34,0,23.2558,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,207.77,1680.97,0.0 -256,64,2560,8192,torch.int8,34,0,14.5967,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,183.9,1495.1,0.0 -256,64,3584,8192,torch.float8_e4m3fn,55,2,15.5912,a8w8_rowwise_128x64x32x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,241.04,1946.17,0.1078 -256,64,4608,16384,torch.int8,30,0,27.9735,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,345.46,2757.46,0.0 -256,64,5120,640,torch.int8,77,0,5.0956,a8w8_rowwise_64x16x16x64_16x16_1x1_4x16x1_4x16x1_1x16x1x4_4x4x1_1x1_interwave_v2,82.31,779.72,0.0 -256,64,5120,1280,torch.int8,76,0,6.4203,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,130.66,1135.6,0.0 -256,64,5120,3200,torch.int8,76,0,10.7687,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,194.75,1601.32,0.0 -256,64,5120,5120,torch.int8,30,0,11.7059,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,286.65,2323.4,0.0 -256,64,5120,6400,torch.int8,30,0,16.907,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,248.08,2001.12,0.0 -256,64,5120,25600,torch.int8,30,0,43.5801,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,384.97,3060.24,0.0 -256,64,6400,5120,torch.int8,30,0,12.4817,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,336.04,2717.17,0.0 -256,64,7168,8192,torch.int8,30,0,18.4286,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,407.85,3264.6,0.0 -256,64,7168,8192,torch.float8_e4m3fn,30,0,19.78,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,379.99,3041.56,0.0 -256,64,7424,8192,torch.float8_e4m3fn,30,0,20.8916,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,372.62,2981.68,0.0 -256,64,8192,1024,torch.float8_e4m3fn,79,0,7.3716,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,145.66,1289.1,0.0 -256,64,8192,2048,torch.int8,30,0,7.5798,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,283.32,2369.04,0.0 -256,64,8192,3584,torch.int8,30,0,11.279,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,333.19,2716.38,0.0 -256,64,8192,7168,torch.int8,30,0,17.495,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,429.62,3442.56,0.0 -256,64,8192,8192,torch.int8,30,0,19.4045,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,442.68,3539.47,0.0 -256,64,8192,8192,torch.float8_e4m3fn,30,0,19.8829,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,432.03,3454.31,0.0 -256,64,8192,28672,torch.int8,30,0,61.7622,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,486.78,3849.68,0.0 -256,64,9216,16384,torch.int8,23,0,41.0647,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,470.66,3731.26,0.0 -256,64,10240,8192,torch.int8,28,0,27.6213,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,388.74,3103.44,0.0 -256,64,10240,8192,torch.float8_e4m3fn,23,0,24.8931,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,431.34,3443.57,0.0 -256,64,12800,5120,torch.int8,23,0,18.5404,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,452.45,3640.81,0.0 -256,64,13312,16384,torch.int8,23,0,50.0251,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,558.07,4414.91,0.0 -256,64,16384,2048,torch.int8,23,0,11.4046,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,376.6,3137.56,0.0 -256,64,16384,4096,torch.int8,23,0,19.0989,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,449.76,3637.29,0.0 -256,64,16384,6656,torch.int8,23,0,26.4384,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,527.97,4220.19,0.0 -256,64,16384,8192,torch.int8,23,0,33.8597,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,507.38,4041.36,0.0 -256,64,16384,13312,torch.int8,23,0,46.8247,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,596.21,4720.86,0.0 -256,64,16384,26624,torch.int8,23,0,89.8884,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,621.15,4895.06,0.0 -256,64,26624,16384,torch.int8,23,0,91.2922,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,611.6,4826.96,0.0 -256,64,28672,8192,torch.float8_e4m3fn,23,0,52.7168,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,570.31,4535.09,0.0 -256,64,51200,5120,torch.int8,23,0,61.5287,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,545.35,4372.35,0.0 -256,64,53248,16384,torch.int8,23,0,184.3385,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,605.78,4775.34,0.0 -256,64,57344,8192,torch.int8,46,0,99.6635,a8w8_rowwise_256x64x128x128_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,603.33,4792.39,0.0 -256,64,57344,8192,torch.float8_e4m3fn,17,0,98.9474,a8w8_rowwise_256x64x224x128_16x16_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,607.69,4827.07,0.0 -256,128,100,5120,torch.int8,34,0,10.6683,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,12.29,111.82,0.0 -256,128,128,49920,torch.float8_e4m3fn,34,0,75.6047,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,21.64,169.46,0.0 -256,128,128,80000,torch.float8_e4m3fn,34,0,119.6585,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,21.91,171.43,0.0 -256,128,128,222336,torch.float8_e4m3fn,34,0,324.8197,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.43,175.33,0.0001 -256,128,128,254848,torch.float8_e4m3fn,34,0,371.2338,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.49,175.83,0.0002 -256,128,128,322816,torch.float8_e4m3fn,34,0,466.5241,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.67,177.21,0.0001 -256,128,128,423168,torch.float8_e4m3fn,34,0,611.0012,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.69,177.35,0.0002 -256,128,128,620160,torch.float8_e4m3fn,34,0,897.8064,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.63,176.87,0.0002 -256,128,128,659584,torch.float8_e4m3fn,34,0,953.0292,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.68,177.21,0.0002 -256,128,128,796544,torch.float8_e4m3fn,34,0,1151.3783,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.67,177.13,0.0004 -256,128,128,941696,torch.float8_e4m3fn,34,0,1362.3745,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.65,176.98,0.0002 -256,128,200,5120,torch.int8,34,0,9.652,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,27.16,179.3,0.0 -256,128,256,256,torch.float8_e4m3fn,68,0,3.1268,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v1,5.37,52.4,0.0 -256,128,256,1024,torch.float8_e4m3fn,79,0,4.0962,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,16.38,111.99,0.0 -256,128,800,5120,torch.int8,34,0,11.5628,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.69,428.63,0.0 -256,128,1024,8192,torch.float8_e4m3fn,34,2,12.0372,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,178.4,805.78,0.1084 -256,128,1280,8192,torch.int8,34,0,13.9827,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,191.98,848.34,0.0 -256,128,1280,8192,torch.float8_e4m3fn,29,3,13.1684,a8w8_rowwise_256x32x96x256_16x16_1x3_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,203.85,900.79,0.1595 -256,128,2304,16384,torch.int8,30,0,26.5035,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,364.62,1525.67,0.0 -256,128,2560,8192,torch.int8,30,0,16.5177,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,325.03,1372.8,0.0 -256,128,3584,8192,torch.float8_e4m3fn,30,0,17.8922,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,420.08,1750.83,0.0 -256,128,4608,16384,torch.int8,23,0,33.4047,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,578.58,2358.18,0.0 -256,128,5120,640,torch.int8,79,0,5.566,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,150.71,838.92,0.0 -256,128,5120,1280,torch.int8,23,0,7.6695,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,218.75,1046.76,0.0 -256,128,5120,3200,torch.int8,23,0,11.8363,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,354.36,1529.56,0.0 -256,128,5120,5120,torch.int8,23,0,14.1859,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,473.07,1986.51,0.0 -256,128,5120,6400,torch.int8,23,0,18.7826,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,446.62,1857.99,0.0 -256,128,5120,25600,torch.int8,23,0,54.0309,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,621.02,2510.78,0.0 -256,128,6400,5120,torch.int8,23,0,14.96,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,560.74,2343.7,0.0 -256,128,7168,8192,torch.int8,28,0,26.0097,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,577.95,2368.49,0.0 -256,128,7168,8192,torch.float8_e4m3fn,23,0,22.197,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,677.23,2775.32,0.0 -256,128,8192,1024,torch.int8,28,0,7.1186,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,301.67,1491.42,0.0 -256,128,8192,1024,torch.float8_e4m3fn,15,0,8.3628,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,256.79,1269.53,0.0 -256,128,8192,2048,torch.int8,23,0,10.2945,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,417.21,1858.91,0.0 -256,128,8192,3584,torch.int8,28,0,13.8708,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,541.87,2300.95,0.0 -256,128,8192,7168,torch.int8,28,0,24.2653,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,619.5,2544.16,0.0 -256,128,8192,8192,torch.int8,28,0,26.3513,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,651.96,2666.08,0.0 -256,128,8192,8192,torch.float8_e4m3fn,23,0,23.2545,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,738.78,3021.12,0.0 -256,128,8192,28672,torch.int8,28,0,87.5085,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,687.13,2750.0,0.0 -256,128,9216,16384,torch.int8,23,0,59.454,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,650.16,2614.65,0.0 -256,128,10240,8192,torch.int8,15,0,31.2683,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,686.79,2800.16,0.0 -256,128,10240,8192,torch.float8_e4m3fn,22,0,30.8204,a8w8_rowwise_256x64x96x256_16x16_2x3_16x16x1_16x16x1_1x64x1x4_8x8x1_2x1_intrawave_v3,696.77,2840.85,0.0 -256,128,12800,5120,torch.int8,23,0,25.5896,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,655.63,2714.7,0.0 -256,128,13312,16384,torch.int8,23,0,65.9651,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,846.43,3389.81,0.0 -256,128,16384,2048,torch.int8,23,0,14.5668,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,589.69,2609.42,0.0 -256,128,16384,4096,torch.int8,23,0,23.3525,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,735.68,3075.79,0.0 -256,128,16384,6656,torch.int8,23,0,34.0329,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,820.3,3352.58,0.0 -256,128,16384,8192,torch.int8,23,0,45.1385,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,761.21,3089.62,0.0 -256,128,16384,13312,torch.int8,23,0,61.6432,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,905.77,3633.85,0.0 -256,128,16384,26624,torch.int8,23,0,118.3409,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,943.62,3750.27,0.0 -256,128,26624,16384,torch.int8,23,0,129.1529,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,864.63,3446.46,0.0 -256,128,28672,8192,torch.float8_e4m3fn,41,0,60.0008,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1002.15,4054.44,0.0 -256,128,51200,5120,torch.int8,23,0,82.0559,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,817.84,3362.42,0.0 -256,128,53248,16384,torch.int8,23,0,230.1893,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,970.24,3858.32,0.0 -256,128,57344,8192,torch.int8,13,0,108.1045,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1112.43,4490.94,0.0 -256,128,57344,8192,torch.float8_e4m3fn,35,0,111.2293,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v4,1081.18,4364.77,0.0 -256,128,105984,128,torch.float8_e4m3fn,44,0,15.537,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,223.52,2620.47,0.0 -256,128,207616,128,torch.float8_e4m3fn,44,0,25.7878,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,263.81,3092.2,0.0 -256,128,270080,128,torch.float8_e4m3fn,44,0,32.2348,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,274.55,3217.86,0.0 -256,128,430336,128,torch.float8_e4m3fn,44,0,46.5205,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,303.12,3552.53,0.0 -256,128,450560,128,torch.float8_e4m3fn,44,0,48.3917,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,305.09,3575.64,0.0 -256,128,452352,128,torch.float8_e4m3fn,44,0,49.0249,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,302.35,3543.5,0.0 -256,128,691968,128,torch.float8_e4m3fn,44,0,71.2418,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,318.27,3730.0,0.0 -256,128,772736,128,torch.float8_e4m3fn,44,0,79.0658,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,320.25,3753.17,0.0 -256,128,855424,128,torch.float8_e4m3fn,44,0,86.4286,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,324.32,3800.82,0.0 -256,128,911744,128,torch.float8_e4m3fn,44,0,91.8719,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,325.19,3811.02,0.0 -256,192,1280,8192,torch.float8_e4m3fn,34,0,15.3959,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,261.53,815.16,0.0 -256,192,8192,1024,torch.float8_e4m3fn,21,0,8.6496,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,372.41,1356.24,0.0 -256,256,100,5120,torch.int8,34,0,10.9556,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,23.93,171.05,0.0 -256,256,200,5120,torch.int8,34,0,9.8147,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,53.42,248.31,0.0 -256,256,256,128,torch.float8_e4m3fn,49,0,2.7541,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_intrawave_v1,6.09,71.39,0.0 -256,256,512,640,torch.float8_e4m3fn,79,0,3.6844,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,45.54,204.56,0.0 -256,256,800,5120,torch.int8,34,0,11.8423,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,177.09,491.15,0.0 -256,256,1024,8192,torch.float8_e4m3fn,34,0,14.7059,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,292.06,748.68,0.0 -256,256,1280,8192,torch.int8,30,0,16.2863,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,329.65,812.85,0.0 -256,256,1280,8192,torch.float8_e4m3fn,30,0,17.037,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,315.12,777.03,0.0 -256,256,2304,16384,torch.int8,23,0,32.4817,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,595.02,1327.6,0.0 -256,256,2560,8192,torch.int8,28,0,23.112,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,464.58,1054.84,0.0 -256,256,3584,8192,torch.float8_e4m3fn,23,0,20.9257,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,718.37,1590.98,0.0 -256,256,4608,16384,torch.int8,23,0,55.0189,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,702.57,1491.33,0.0 -256,256,5120,640,torch.int8,81,0,6.6084,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_interwave_v2,253.88,917.33,0.0 -256,256,5120,1280,torch.int8,15,0,8.2757,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,405.46,1148.27,0.0 -256,256,5120,3200,torch.int8,15,0,15.7837,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,531.47,1256.02,0.0 -256,256,5120,5120,torch.int8,23,0,21.8405,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,614.54,1380.31,0.0 -256,256,5120,6400,torch.int8,21,0,23.7832,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,705.42,1556.89,0.0 -256,256,5120,25600,torch.int8,23,0,86.0166,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,780.19,1630.46,0.0 -256,256,6400,5120,torch.int8,23,0,22.7861,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,736.29,1639.4,0.0 -256,256,7168,8192,torch.int8,15,0,30.291,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,992.53,2128.93,0.0 -256,256,7168,8192,torch.float8_e4m3fn,15,0,30.0111,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1001.79,2148.79,0.0 -256,256,8192,1024,torch.int8,21,0,8.6043,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,499.17,1492.86,0.0 -256,256,8192,1024,torch.float8_e4m3fn,15,0,9.1986,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,466.92,1396.41,0.0 -256,256,8192,2048,torch.int8,15,0,10.9387,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,785.28,1965.12,0.0 -256,256,8192,3584,torch.int8,15,0,16.5846,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,906.41,2078.55,0.0 -256,256,8192,7168,torch.int8,21,0,29.138,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1031.81,2222.17,0.0 -256,256,8192,8192,torch.int8,15,0,31.6256,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1086.45,2320.91,0.0 -256,256,8192,8192,torch.float8_e4m3fn,15,0,31.0706,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1105.86,2362.37,0.0 -256,256,8192,28672,torch.int8,15,0,102.0397,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1178.55,2414.9,0.0 -256,256,9216,16384,torch.int8,23,0,86.8248,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,890.41,1841.73,0.0 -256,256,10240,8192,torch.int8,41,0,42.9954,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,998.94,2121.76,0.0 -256,256,10240,8192,torch.float8_e4m3fn,14,0,37.6153,a8w8_rowwise_256x128x96x256_32x32_1x3_16x16x1_16x16x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1141.81,2425.24,0.0 -256,256,12800,5120,torch.int8,15,0,42.5463,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,788.66,1725.19,0.0 -256,256,13312,16384,torch.int8,23,0,113.4219,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,984.55,2020.01,0.0 -256,256,16384,2048,torch.int8,15,0,22.0982,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,777.43,1921.76,0.0 -256,256,16384,4096,torch.int8,15,0,39.8495,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,862.24,1920.88,0.0 -256,256,16384,6656,torch.int8,23,0,57.6756,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,968.08,2065.77,0.0 -256,256,16384,8192,torch.int8,15,0,72.9104,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,942.52,1984.68,0.0 -256,256,16384,13312,torch.int8,23,0,106.2833,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1050.67,2163.09,0.0 -256,256,16384,26624,torch.int8,23,0,202.2465,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1104.29,2231.99,0.0 -256,256,26624,16384,torch.int8,23,0,206.2403,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1082.9,2201.48,0.0 -256,256,28672,8192,torch.float8_e4m3fn,4,0,80.7001,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1490.2,3118.44,0.0 -256,256,51200,5120,torch.int8,40,0,120.9285,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1109.89,2395.38,0.0 -256,256,53248,16384,torch.int8,40,0,368.0253,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1213.71,2456.01,0.0 -256,256,57344,8192,torch.int8,2,0,135.372,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1776.72,3702.53,0.0 -256,256,57344,8192,torch.float8_e4m3fn,2,0,140.7498,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1708.83,3561.07,0.0 -256,320,1280,8192,torch.float8_e4m3fn,30,0,17.4934,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,383.62,796.09,0.0 -256,320,8192,1024,torch.float8_e4m3fn,62,0,12.0692,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,444.83,1156.59,0.0 -256,384,8448,30080,torch.float8_e4m3fn,41,0,147.1478,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1326.29,1849.53,0.0 -256,512,100,5120,torch.int8,34,0,11.1055,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,47.21,291.37,0.0 -256,512,200,5120,torch.int8,34,0,11.8774,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,88.28,324.17,0.0 -256,512,800,5120,torch.int8,30,0,13.1908,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,317.97,571.36,0.0 -256,512,1280,8192,torch.int8,28,0,23.2059,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,462.7,689.08,0.0 -256,512,1280,8192,torch.float8_e4m3fn,23,0,21.2265,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,505.85,753.34,0.0 -256,512,1792,7424,torch.float8_e4m3fn,23,0,20.9911,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,648.99,902.28,0.0 -256,512,1920,6784,torch.float8_e4m3fn,23,0,19.552,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,682.18,944.39,0.0 -256,512,2304,16384,torch.int8,23,0,54.2058,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,713.11,894.68,0.0 -256,512,2560,8192,torch.int8,21,0,27.6646,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,776.26,1004.43,0.0 -256,512,4608,16384,torch.int8,23,0,87.1647,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,886.93,1016.52,0.0 -256,512,5120,640,torch.int8,47,0,8.4036,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,399.29,1052.81,0.0 -256,512,5120,1280,torch.int8,22,0,12.2285,a8w8_rowwise_256x64x96x256_16x16_2x3_16x16x1_16x16x1_1x64x1x4_8x8x1_2x1_intrawave_v3,548.79,1018.26,0.0 -256,512,5120,3200,torch.int8,13,0,21.8498,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,767.84,1064.78,0.0 -256,512,5120,5120,torch.int8,23,0,33.1376,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,810.06,1028.4,0.0 -256,512,5120,6400,torch.int8,15,0,38.8017,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,864.77,1064.07,0.0 -256,512,5120,25600,torch.int8,23,0,131.897,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1017.6,1132.87,0.0 -256,512,6400,5120,torch.int8,15,0,41.3468,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,811.54,1014.42,0.0 -256,512,7168,8192,torch.int8,41,0,47.5197,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1265.36,1478.43,0.0 -256,512,8192,1024,torch.int8,13,0,11.7749,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,729.51,1469.35,0.0 -256,512,8192,1024,torch.float8_e4m3fn,13,0,12.6356,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,679.82,1369.27,0.0 -256,512,8192,2048,torch.int8,41,0,16.8904,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1017.14,1552.03,0.0 -256,512,8192,3584,torch.int8,41,0,24.9561,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1204.71,1586.14,0.0 -256,512,8192,7168,torch.int8,41,0,44.168,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1361.38,1602.49,0.0 -256,512,8192,8192,torch.int8,13,0,51.209,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1341.94,1556.21,0.0 -256,512,8192,28672,torch.int8,41,0,150.8316,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1594.61,1710.18,0.0 -256,512,9216,16384,torch.int8,23,0,141.9471,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1089.27,1189.32,0.0 -256,512,10240,8192,torch.int8,11,0,61.9597,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1386.37,1590.81,0.0 -256,512,12800,5120,torch.int8,23,0,74.3061,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,903.14,1093.65,0.0 -256,512,13312,16384,torch.int8,23,0,199.0315,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1122.13,1206.46,0.0 -256,512,16384,2048,torch.int8,15,0,38.4305,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,894.07,1336.96,0.0 -256,512,16384,4096,torch.int8,15,0,65.4454,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1050.03,1313.82,0.0 -256,512,16384,6656,torch.int8,15,0,102.3241,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1091.33,1263.02,0.0 -256,512,16384,8192,torch.int8,15,0,123.6929,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1111.13,1254.63,0.0 -256,512,16384,13312,torch.int8,15,0,191.2442,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1167.82,1263.81,0.0 -256,512,16384,26624,torch.int8,15,0,365.7012,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1221.43,1275.95,0.0 -256,512,26624,16384,torch.int8,15,0,349.3418,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1278.62,1350.71,0.0 -256,512,51200,5120,torch.int8,40,0,223.8159,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1199.36,1417.21,0.0 -256,512,53248,16384,torch.int8,15,0,651.6344,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1370.94,1435.36,0.0 -256,512,57344,8192,torch.int8,2,0,233.9286,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2056.34,2277.09,0.0 -256,640,128,128,torch.float8_e4m3fn,50,0,2.799,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,7.49,93.66,0.0 -256,640,256,768,torch.float8_e4m3fn,79,0,3.9362,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,63.93,258.07,0.0 -256,640,384,128,torch.float8_e4m3fn,49,0,2.9702,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_intrawave_v1,21.18,209.61,0.0 -256,640,768,896,torch.float8_e4m3fn,76,0,5.2439,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,167.97,428.04,0.0 -256,1024,100,5120,torch.int8,34,0,11.6746,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,89.82,510.48,0.0 -256,1024,200,5120,torch.int8,34,0,12.0055,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,174.68,556.12,0.0 -256,1024,800,5120,torch.int8,23,0,15.5235,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,540.38,707.14,0.0 -256,1024,1280,8192,torch.int8,21,0,27.7294,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,774.44,775.2,0.0 -256,1024,1280,8192,torch.float8_e4m3fn,21,0,28.4599,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,754.56,755.3,0.0 -256,1024,2304,16384,torch.int8,23,0,86.7135,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,891.55,683.22,0.0 -256,1024,2560,8192,torch.int8,41,0,42.1202,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1019.69,821.53,0.0 -256,1024,4608,16384,torch.int8,23,0,143.2677,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1079.23,709.94,0.0 -256,1024,5120,640,torch.int8,11,0,11.7704,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,570.15,1224.93,0.0 -256,1024,5120,1280,torch.int8,11,0,17.5871,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,763.16,1043.38,0.0 -256,1024,5120,3200,torch.int8,11,0,27.688,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1211.88,1088.8,0.0 -256,1024,5120,5120,torch.int8,23,0,55.7649,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,962.74,752.14,0.0 -256,1024,5120,6400,torch.int8,11,0,57.5786,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1165.52,865.03,0.0 -256,1024,5120,25600,torch.int8,23,0,227.1849,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1181.57,738.48,0.0 -256,1024,6400,5120,torch.int8,23,0,72.0506,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,931.41,709.47,0.0 -256,1024,7168,8192,torch.int8,13,0,82.6278,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1455.43,989.85,0.0 -256,1024,8192,1024,torch.int8,13,0,17.3848,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,988.21,1507.89,0.0 -256,1024,8192,1024,torch.float8_e4m3fn,13,0,18.4495,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,931.18,1420.87,0.0 -256,1024,8192,2048,torch.int8,41,0,27.2285,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1261.9,1309.35,0.0 -256,1024,8192,3584,torch.int8,13,0,43.9928,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1366.8,1132.17,0.0 -256,1024,8192,7168,torch.int8,13,0,79.3336,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1515.87,1044.17,0.0 -256,1024,8192,8192,torch.int8,13,0,88.9386,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1545.32,1037.51,0.0 -256,1024,8192,28672,torch.int8,41,0,274.6402,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1751.51,1023.22,0.0 -256,1024,9216,16384,torch.int8,15,0,243.9282,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1267.74,765.17,0.0 -256,1024,10240,8192,torch.int8,4,0,89.391,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1921.88,1266.86,0.0 -256,1024,12800,5120,torch.int8,40,0,104.2861,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1287.01,930.07,0.0 -256,1024,13312,16384,torch.int8,40,0,306.5211,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1457.25,855.22,0.0 -256,1024,16384,2048,torch.int8,1,0,61.8687,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1110.73,1118.59,0.0 -256,1024,16384,4096,torch.int8,1,0,108.4683,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1267.09,966.71,0.0 -256,1024,16384,6656,torch.int8,1,0,165.4754,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1349.68,902.99,0.0 -256,1024,16384,8192,torch.int8,1,0,202.2384,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1359.18,871.05,0.0 -256,1024,16384,13312,torch.int8,1,0,307.7016,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1451.66,862.17,0.0 -256,1024,16384,26624,torch.int8,1,0,603.8507,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1479.43,823.09,0.0 -256,1024,26624,16384,torch.int8,15,0,630.2527,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1417.45,805.25,0.0 -256,1024,51200,5120,torch.int8,39,0,396.7395,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1353.21,938.26,0.0 -256,1024,53248,16384,torch.int8,40,0,1195.3563,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1494.71,835.1,0.0 -256,1024,57344,8192,torch.int8,2,0,438.8629,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2192.19,1357.12,0.0 -256,2048,100,5120,torch.int8,34,0,11.6595,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,179.87,978.37,0.0 -256,2048,200,5120,torch.int8,30,0,13.0876,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,320.48,942.03,0.0 -256,2048,800,5120,torch.int8,23,0,24.3923,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,687.81,732.14,0.0 -256,2048,1280,8192,torch.int8,41,0,42.0169,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1022.2,773.64,0.0 -256,2048,1280,8192,torch.float8_e4m3fn,14,0,36.1844,a8w8_rowwise_256x128x96x256_32x32_1x3_16x16x1_16x16x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1186.97,898.34,0.0 -256,2048,2304,16384,torch.int8,23,0,145.2155,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1064.75,556.0,0.0 -256,2048,2560,8192,torch.int8,11,0,61.0366,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1407.34,790.26,0.0 -256,2048,4608,16384,torch.int8,11,0,247.0665,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1251.64,517.78,0.0 -256,2048,5120,640,torch.int8,10,0,17.6749,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,759.37,1446.06,0.0 -256,2048,5120,1280,torch.int8,4,0,27.4475,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,978.0,1098.34,0.0 -256,2048,5120,3200,torch.int8,4,0,44.9517,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1492.91,976.81,0.0 -256,2048,5120,5120,torch.int8,15,0,96.048,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1117.92,600.45,0.0 -256,2048,5120,6400,torch.int8,4,0,90.0028,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1491.26,742.72,0.0 -256,2048,5120,25600,torch.int8,15,0,419.2679,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1280.5,487.69,0.0 -256,2048,6400,5120,torch.int8,39,0,107.5421,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1248.05,645.96,0.0 -256,2048,7168,8192,torch.int8,2,0,116.6871,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2061.22,898.62,0.0 -256,2048,8192,1024,torch.int8,1,0,29.7892,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1153.43,1478.39,0.0 -256,2048,8192,1024,torch.float8_e4m3fn,43,0,31.5009,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1090.75,1398.06,0.0 -256,2048,8192,2048,torch.int8,1,0,42.7444,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1607.68,1275.63,0.0 -256,2048,8192,3584,torch.int8,1,0,65.1518,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1845.83,1078.32,0.0 -256,2048,8192,7168,torch.int8,1,0,113.2102,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2124.53,944.74,0.0 -256,2048,8192,8192,torch.int8,1,0,125.3472,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2192.93,936.92,0.0 -256,2048,8192,28672,torch.int8,1,0,387.0603,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2485.59,845.23,0.0 -256,2048,9216,16384,torch.int8,15,0,431.0243,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1434.9,515.74,0.0 -256,2048,10240,8192,torch.int8,4,0,170.1034,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2019.93,838.35,0.0 -256,2048,12800,5120,torch.int8,39,0,188.6903,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1422.62,680.75,0.0 -256,2048,13312,16384,torch.int8,40,0,609.7983,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1465.0,502.11,0.0 -256,2048,16384,2048,torch.int8,1,0,109.435,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1255.9,958.17,0.0 -256,2048,16384,4096,torch.int8,1,0,192.1657,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1430.42,742.1,0.0 -256,2048,16384,6656,torch.int8,1,0,297.4271,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1501.8,638.11,0.0 -256,2048,16384,8192,torch.int8,1,0,351.3779,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1564.57,620.71,0.0 -256,2048,16384,13312,torch.int8,1,0,562.4285,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1588.39,555.58,0.0 -256,2048,16384,26624,torch.int8,1,0,1159.8861,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1540.42,480.95,0.0 -256,2048,26624,16384,torch.int8,40,0,1145.701,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1559.49,505.21,0.0 -256,2048,51200,5120,torch.int8,39,0,756.332,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1419.67,637.74,0.0 -256,2048,53248,16384,torch.int8,1,0,2196.6099,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1626.79,511.73,0.0 -256,2048,57344,8192,torch.int8,1,0,814.7053,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2361.77,885.5,0.0 -256,4096,100,5120,torch.int8,34,0,16.1651,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,259.47,1379.68,0.0 -256,4096,200,5120,torch.int8,23,0,15.2341,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,550.65,1551.38,0.0 -256,4096,800,5120,torch.int8,15,0,36.4711,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,920.03,867.02,0.0 -256,4096,1280,8192,torch.int8,11,0,59.932,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1433.28,909.8,0.0 -256,4096,1280,8192,torch.float8_e4m3fn,11,0,51.722,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1660.79,1054.21,0.0 -256,4096,2304,16384,torch.int8,11,0,246.424,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1254.9,502.11,0.0 -256,4096,2560,8192,torch.int8,4,0,90.7509,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1893.08,831.92,0.0 -256,4096,4608,16384,torch.int8,15,0,433.3975,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1427.04,416.14,0.0 -256,4096,5120,640,torch.int8,13,0,32.4583,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,827.02,1473.93,0.0 -256,4096,5120,1280,torch.int8,13,0,50.6317,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1060.35,1061.38,0.0 -256,4096,5120,3200,torch.int8,4,0,86.6115,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1549.65,824.77,0.0 -256,4096,5120,5120,torch.int8,15,0,177.0814,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1212.71,503.32,0.0 -256,4096,5120,6400,torch.int8,4,0,162.2341,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1654.62,622.1,0.0 -256,4096,5120,25600,torch.int8,15,0,782.4893,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1372.21,355.11,0.0 -256,4096,6400,5120,torch.int8,39,0,188.523,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1423.89,563.16,0.0 -256,4096,7168,8192,torch.int8,2,0,219.2844,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2193.66,688.58,0.0 -256,4096,7424,8192,torch.float8_e4m3fn,1,0,232.7657,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2140.42,666.72,0.0 -256,4096,8192,1024,torch.int8,1,0,55.6305,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1235.28,1432.52,0.0 -256,4096,8192,1024,torch.float8_e4m3fn,43,0,59.9656,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1145.98,1328.96,0.0 -256,4096,8192,2048,torch.int8,1,0,83.0059,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1655.77,1111.66,0.0 -256,4096,8192,3584,torch.int8,1,0,122.3648,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1965.58,908.34,0.0 -256,4096,8192,7168,torch.int8,1,0,215.6239,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2230.9,719.72,0.0 -256,4096,8192,8192,torch.int8,1,0,240.0438,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2290.23,698.92,0.0 -256,4096,8192,28672,torch.int8,1,0,764.1022,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2518.18,548.92,0.0 -256,4096,9216,16384,torch.int8,39,0,778.6417,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1588.6,377.07,0.0 -256,4096,10240,8192,torch.int8,1,0,318.603,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2156.9,631.9,0.0 -256,4096,12800,5120,torch.int8,39,0,356.6531,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1505.3,536.56,0.0 -256,4096,13312,16384,torch.int8,10,0,1114.1158,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1603.7,353.88,0.0 -256,4096,16384,2048,torch.int8,1,0,205.0841,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1340.32,858.97,0.0 -256,4096,16384,4096,torch.int8,39,0,353.73,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1554.17,616.58,0.0 -256,4096,16384,6656,torch.int8,39,0,561.0198,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1592.37,482.22,0.0 -256,4096,16384,8192,torch.int8,39,0,657.0519,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1673.4,459.61,0.0 -256,4096,16384,13312,torch.int8,1,0,1080.1678,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1654.1,376.65,0.0 -256,4096,16384,26624,torch.int8,1,0,2197.8702,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1625.85,309.15,0.0 -256,4096,26624,16384,torch.int8,10,0,2199.7579,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1624.46,327.95,0.0 -256,4096,51200,5120,torch.int8,1,0,1418.8126,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1513.58,495.16,0.0 -256,4096,53248,16384,torch.int8,1,0,4075.2597,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1753.71,337.58,0.0 -256,4096,57344,8192,torch.int8,1,0,1617.8989,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2378.57,601.45,0.0 -256,5120,7424,8192,torch.float8_e4m3fn,2,0,308.3064,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2019.97,579.88,0.0 -256,8192,100,5120,torch.int8,29,0,27.2037,a8w8_rowwise_256x32x96x256_16x16_1x3_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,308.36,1620.86,0.0 -256,8192,200,5120,torch.int8,21,0,25.2734,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,663.83,1829.74,0.0 -256,8192,800,5120,torch.int8,41,0,65.9722,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1017.23,896.53,0.0 -256,8192,1280,8192,torch.int8,4,0,95.3807,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1801.19,1033.4,0.0 -256,8192,1280,8192,torch.float8_e4m3fn,4,0,81.7243,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2102.17,1206.08,0.0 -256,8192,2304,16384,torch.int8,15,0,463.8213,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1333.43,452.15,0.0 -256,8192,2560,8192,torch.int8,4,0,173.6356,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1978.84,748.83,0.0 -256,8192,4608,16384,torch.int8,39,0,844.5637,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1464.6,337.7,0.0 -256,8192,5120,640,torch.int8,42,0,55.1814,a8w8_rowwise_256x128x256x64_32x32_2x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,972.92,1674.58,0.0 -256,8192,5120,1280,torch.int8,39,0,86.4091,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1242.63,1168.0,0.0 -256,8192,5120,3200,torch.int8,40,0,149.3935,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1796.83,846.65,0.0 -256,8192,5120,5120,torch.int8,39,0,277.9848,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1545.04,546.95,0.0 -256,8192,5120,6400,torch.int8,40,0,276.3348,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1942.83,611.88,0.0 -256,8192,5120,25600,torch.int8,39,0,1513.1737,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1419.19,280.65,0.0 -256,8192,6400,5120,torch.int8,39,0,359.0195,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1495.38,500.16,0.0 -256,8192,7168,8192,torch.int8,2,0,420.8889,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2285.81,577.99,0.0 -256,8192,7424,8192,torch.float8_e4m3fn,1,0,441.2013,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2258.45,565.64,0.0 -256,8192,8192,1024,torch.int8,1,0,105.9654,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1297.02,1424.95,0.0 -256,8192,8192,1024,torch.float8_e4m3fn,10,0,119.8588,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1146.67,1259.77,0.0 -256,8192,8192,2048,torch.int8,1,0,159.9883,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1718.11,1048.65,0.0 -256,8192,8192,3584,torch.int8,1,0,232.9396,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2065.07,828.27,0.0 -256,8192,8192,7168,torch.int8,1,0,412.978,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2329.6,609.37,0.0 -256,8192,8192,8192,torch.int8,1,0,469.2882,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2342.93,572.01,0.0 -256,8192,8192,28672,torch.int8,1,0,1532.6908,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2510.81,394.06,0.0 -256,8192,9216,16384,torch.int8,43,0,1557.3892,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1588.49,280.09,0.0 -256,8192,10240,8192,torch.int8,1,0,575.9516,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2386.29,553.46,0.0 -256,8192,12800,5120,torch.int8,39,0,683.3672,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1571.25,464.16,0.0 -256,8192,13312,16384,torch.int8,40,0,2189.1084,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1632.36,260.57,0.0 -256,8192,16384,2048,torch.int8,39,0,392.4492,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1400.83,812.25,0.0 -256,8192,16384,4096,torch.int8,39,0,678.4652,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1620.59,544.02,0.0 -256,8192,16384,6656,torch.int8,39,0,1094.0621,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1633.09,394.87,0.0 -256,8192,16384,8192,torch.int8,1,0,1288.1354,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1707.14,364.68,0.0 -256,8192,16384,13312,torch.int8,1,0,2091.5748,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1708.48,284.76,0.0 -256,8192,16384,26624,torch.int8,1,0,4736.0103,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1509.04,194.84,0.0 -256,8192,26624,16384,torch.int8,40,0,4074.1172,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1754.2,247.08,0.0 -256,8192,51200,5120,torch.int8,1,0,2639.7599,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1627.03,432.97,0.0 -256,8192,53248,16384,torch.int8,40,0,8028.0784,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1780.46,234.06,0.0 -256,8192,57344,8192,torch.int8,1,0,3164.0659,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2432.5,466.61,0.0 -256,16384,100,5120,torch.int8,27,0,42.6746,a8w8_rowwise_256x32x160x256_16x16_1x5_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,393.14,2054.5,0.0 -256,16384,200,5120,torch.int8,21,0,44.0772,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,761.27,2075.08,0.0 -256,16384,800,5120,torch.int8,1,0,111.2384,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1206.58,1026.59,0.0 -256,16384,1280,8192,torch.int8,4,0,178.1119,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1929.11,1047.92,0.0 -256,16384,1280,8192,torch.float8_e4m3fn,4,0,174.2513,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1971.85,1071.13,0.0 -256,16384,2304,16384,torch.int8,15,0,878.7062,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1407.7,434.37,0.0 -256,16384,2560,8192,torch.int8,1,0,329.1334,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2087.89,726.38,0.0 -256,16384,4608,16384,torch.int8,40,0,1632.4516,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1515.45,303.18,0.0 -256,16384,5120,640,torch.int8,43,0,99.8836,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1074.99,1817.46,0.0 -256,16384,5120,1280,torch.int8,1,0,153.6015,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1398.09,1271.45,0.0 -256,16384,5120,3200,torch.int8,1,0,271.2735,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1979.08,872.13,0.0 -256,16384,5120,5120,torch.int8,1,0,518.1604,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1657.78,536.27,0.0 -256,16384,5120,6400,torch.int8,1,0,543.7765,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1974.6,561.62,0.0 -256,16384,5120,25600,torch.int8,1,0,2869.8056,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1496.61,250.29,0.0 -256,16384,6400,5120,torch.int8,40,0,690.0605,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1556.01,472.96,0.0 -256,16384,7168,8192,torch.int8,1,0,815.6995,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2358.89,524.48,0.0 -256,16384,8192,1024,torch.int8,10,0,198.2601,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1386.45,1480.89,0.0 -256,16384,8192,1024,torch.float8_e4m3fn,10,0,221.8588,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1238.98,1323.37,0.0 -256,16384,8192,2048,torch.int8,1,0,299.7045,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1834.33,1063.6,0.0 -256,16384,8192,3584,torch.int8,1,0,463.2471,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2076.8,769.6,0.0 -256,16384,8192,7168,torch.int8,1,0,819.3844,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2348.28,542.6,0.0 -256,16384,8192,8192,torch.int8,1,0,933.9446,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2354.55,502.99,0.0 -256,16384,8192,28672,torch.int8,1,0,3088.0838,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2492.35,315.11,0.0 -256,16384,9216,16384,torch.int8,1,0,2949.0967,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1677.73,244.62,0.0 -256,16384,10240,8192,torch.int8,1,0,1149.3883,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2391.51,481.69,0.0 -256,16384,12800,5120,torch.int8,39,0,1303.8826,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1646.99,436.28,0.0 -256,16384,13312,16384,torch.int8,1,0,4134.6494,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1728.52,223.17,0.0 -256,16384,16384,2048,torch.int8,1,0,719.1952,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1528.81,839.8,0.0 -256,16384,16384,4096,torch.int8,39,0,1327.6342,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1656.35,505.48,0.0 -256,16384,16384,6656,torch.int8,39,0,2120.7951,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1684.94,355.99,0.0 -256,16384,16384,8192,torch.int8,1,0,2555.2385,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1721.19,315.16,0.0 -256,16384,16384,13312,torch.int8,1,0,4157.4955,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1719.02,234.05,0.0 -256,16384,16384,26624,torch.int8,1,0,9368.503,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1525.71,150.43,0.0 -256,16384,26624,16384,torch.int8,40,0,8090.2903,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1766.77,194.93,0.0 -256,16384,51200,5120,torch.int8,1,0,5290.8019,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1623.56,382.5,0.0 -256,16384,53248,16384,torch.int8,40,0,15996.8357,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1787.06,180.39,0.0 -256,16384,57344,8192,torch.int8,1,0,6467.762,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2379.98,383.91,0.0 -256,32768,100,5120,torch.int8,27,0,78.5325,a8w8_rowwise_256x32x160x256_16x16_1x5_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,427.27,2226.31,0.0 -256,32768,200,5120,torch.int8,21,0,82.0643,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,817.76,2216.6,0.0 -256,32768,800,5120,torch.int8,1,0,216.0528,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1242.45,1038.16,0.0 -256,32768,1280,8192,torch.int8,1,0,339.8668,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2021.95,1067.5,0.0 -256,32768,2304,16384,torch.int8,1,0,1717.9546,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1440.03,422.37,0.0 -256,32768,2560,8192,torch.int8,1,0,599.612,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2292.13,762.46,0.0 -256,32768,4608,16384,torch.int8,40,0,3026.9648,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1634.58,302.07,0.0 -256,32768,5120,640,torch.int8,43,0,191.0117,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1124.27,1883.62,0.0 -256,32768,5120,1280,torch.int8,1,0,299.0856,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1436.03,1284.05,0.0 -256,32768,5120,3200,torch.int8,1,0,538.4921,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1993.98,848.27,0.0 -256,32768,5120,5120,torch.int8,40,0,999.0598,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1719.6,530.03,0.0 -256,32768,5120,6400,torch.int8,1,0,1028.2651,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2088.45,562.14,0.0 -256,32768,5120,25600,torch.int8,1,0,5734.5832,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1497.92,227.65,0.0 -256,32768,6400,5120,torch.int8,40,0,1274.1106,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1685.48,486.59,0.0 -256,32768,7168,8192,torch.int8,1,0,1620.6254,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2374.57,491.73,0.0 -256,32768,8192,1024,torch.int8,10,0,379.9256,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1447.01,1523.49,0.0 -256,32768,8192,2048,torch.int8,1,0,594.1387,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1850.6,1044.8,0.0 -256,32768,8192,3584,torch.int8,1,0,899.1908,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2139.86,760.32,0.0 -256,32768,8192,7168,torch.int8,1,0,1640.4404,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2345.89,506.25,0.0 -256,32768,8192,8192,torch.int8,1,0,1837.7153,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2393.21,474.73,0.0 -256,32768,8192,28672,torch.int8,1,0,6209.7387,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2478.87,275.58,0.0 -256,32768,9216,16384,torch.int8,1,0,5884.19,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1681.73,219.55,0.0 -256,32768,10240,8192,torch.int8,1,0,2299.333,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2390.94,445.09,0.0 -256,32768,12800,5120,torch.int8,39,0,2481.3683,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1730.89,432.09,0.0 -256,32768,13312,16384,torch.int8,40,0,8230.5195,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1736.66,197.73,0.0 -256,32768,16384,2048,torch.int8,1,0,1409.7173,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1559.9,833.08,0.0 -256,32768,16384,4096,torch.int8,39,0,2546.0869,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1727.37,500.8,0.0 -256,32768,16384,6656,torch.int8,39,0,4073.3594,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1754.53,343.92,0.0 -256,32768,16384,8192,torch.int8,39,0,4965.434,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1771.47,297.33,0.0 -256,32768,16384,13312,torch.int8,1,0,8271.2663,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1728.11,208.92,0.0 -256,32768,16384,26624,torch.int8,1,0,19064.3766,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1499.51,124.96,0.0 -256,32768,26624,16384,torch.int8,40,0,16029.2644,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1783.44,169.56,0.0 -256,32768,51200,5120,torch.int8,1,0,10610.0611,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1619.21,356.77,0.0 -256,32768,53248,16384,torch.int8,40,0,31823.1958,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1796.63,153.94,0.0 -256,32768,57344,8192,torch.int8,1,0,12840.4952,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2397.6,350.17,0.0 -256,35200,256,19968,torch.float8_e4m3fn,1,0,245.3664,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1466.67,2958.87,0.0 +gfx,cu_num,M,N,K,q_dtype_w,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx942,80,1,1280,8192,torch.int8,34,0,20.5611,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1.02,510.5,0.0 +gfx942,80,1,8192,1024,torch.int8,78,0,9.1327,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,1.84,920.43,0.0 +gfx942,80,32,1280,8192,torch.int8,34,0,19.9841,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,33.58,541.92,0.0 +gfx942,80,32,8192,1024,torch.int8,28,0,9.9382,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,54.02,900.13,0.0 +gfx942,80,64,1280,8192,torch.int8,34,0,19.6589,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,68.27,568.39,0.0 +gfx942,80,64,8192,1024,torch.int8,21,0,13.6554,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,78.63,695.89,0.0 +gfx942,80,128,1280,8192,torch.int8,30,0,23.3376,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,115.02,508.28,0.0 +gfx942,80,128,8192,1024,torch.int8,45,0,20.3684,a8w8_rowwise_256x128x64x128_32x32_2x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,105.43,521.24,0.0 +gfx942,80,192,1280,8192,torch.int8,23,0,33.0028,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,122.01,380.28,0.0 +gfx942,80,192,8192,1024,torch.int8,47,0,26.1312,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,123.27,448.92,0.0 +gfx942,80,256,1280,8192,torch.int8,23,0,33.3204,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,161.12,397.3,0.0 +gfx942,80,256,8192,1024,torch.int8,13,0,29.7436,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,144.4,431.86,0.0 +gfx942,80,320,1280,8192,torch.int8,21,0,54.4822,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,123.18,255.61,0.0 +gfx942,80,320,8192,1024,torch.int8,47,0,39.6412,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,135.43,352.14,0.0 +gfx942,80,512,1280,8192,torch.int8,21,0,55.0378,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,195.09,290.54,0.0 +gfx942,80,512,8192,1024,torch.int8,13,0,52.947,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,162.24,326.77,0.0 +gfx942,80,1024,1280,8192,torch.int8,13,0,88.6516,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,242.24,242.48,0.0 +gfx942,80,1024,8192,1024,torch.int8,13,0,90.2628,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,190.33,290.42,0.0 +gfx942,80,2048,1280,8192,torch.int8,13,0,140.7575,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,305.13,230.94,0.0 +gfx942,80,2048,8192,1024,torch.int8,41,0,165.9552,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,207.04,265.37,0.0 +gfx942,80,4096,1280,8192,torch.int8,13,0,275.3109,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,312.01,198.05,0.0 +gfx942,80,4096,8192,1024,torch.int8,13,0,305.9089,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,224.64,260.51,0.0 +gfx942,80,8192,1280,8192,torch.int8,13,0,545.0371,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,315.21,180.84,0.0 +gfx942,80,8192,8192,1024,torch.int8,13,0,598.5803,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,229.61,252.26,0.0 +gfx942,80,16384,1280,8192,torch.int8,13,0,1075.9373,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,319.35,173.47,0.0 +gfx942,80,16384,8192,1024,torch.int8,13,0,1176.7808,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,233.58,249.5,0.0 +gfx950,256,1,100,5120,torch.int8,34,0,8.7502,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.12,59.12,0.0 +gfx950,256,1,200,5120,torch.int8,34,0,9.0193,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.23,114.15,0.0 +gfx950,256,1,800,5120,torch.int8,34,0,11.5471,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.71,355.3,0.0 +gfx950,256,1,1280,8192,torch.int8,34,0,17.6914,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1.19,593.31,0.0 +gfx950,256,1,1280,8192,torch.float8_e4m3fn,79,3,7.6797,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,2.73,1366.79,0.1344 +gfx950,256,1,2304,16384,torch.int8,34,0,28.7938,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2.62,1311.73,0.0 +gfx950,256,1,2560,8192,torch.int8,34,0,17.4658,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2.4,1201.48,0.0 +gfx950,256,1,4608,16384,torch.int8,34,0,30.4899,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.95,2476.99,0.0 +gfx950,256,1,5120,640,torch.int8,79,0,4.3227,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,1.52,760.56,0.0 +gfx950,256,1,5120,1280,torch.int8,34,0,6.1049,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2.15,1075.39,0.0 +gfx950,256,1,5120,3200,torch.int8,30,0,9.8138,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,3.34,1670.86,0.0 +gfx950,256,1,5120,5120,torch.int8,34,0,12.3186,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.26,2129.28,0.0 +gfx950,256,1,5120,6400,torch.int8,34,0,14.5779,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.5,2248.93,0.0 +gfx950,256,1,5120,25600,torch.int8,23,0,47.1407,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,5.56,2781.2,0.0 +gfx950,256,1,6400,5120,torch.int8,34,0,12.7506,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,5.14,2571.32,0.0 +gfx950,256,1,7168,8192,torch.int8,34,0,19.015,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.18,3089.29,0.0 +gfx950,256,1,8192,1024,torch.int8,79,0,5.7218,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,2.93,1469.12,0.0 +gfx950,256,1,8192,1024,torch.float8_e4m3fn,60,0,6.3982,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,2.62,1313.81,0.0 +gfx950,256,1,8192,2048,torch.int8,34,0,7.1625,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,4.68,2344.94,0.0 +gfx950,256,1,8192,3584,torch.int8,34,0,11.2488,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,5.22,2611.84,0.0 +gfx950,256,1,8192,7168,torch.int8,34,0,16.9371,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.93,3468.35,0.0 +gfx950,256,1,8192,8192,torch.int8,34,0,19.4709,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.89,3447.89,0.0 +gfx950,256,1,8192,28672,torch.int8,34,0,59.8294,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,7.85,3926.6,0.0 +gfx950,256,1,9216,16384,torch.int8,34,0,34.1179,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.85,4426.7,0.0 +gfx950,256,1,10240,8192,torch.int8,34,0,20.5574,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.16,4081.97,0.0 +gfx950,256,1,12800,5120,torch.int8,34,0,14.7015,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.92,4459.87,0.0 +gfx950,256,1,13312,16384,torch.int8,34,0,43.9839,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.92,4959.7,0.0 +gfx950,256,1,16384,2048,torch.int8,34,0,8.2219,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.16,4085.34,0.0 +gfx950,256,1,16384,4096,torch.int8,34,0,15.9093,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,8.44,4220.53,0.0 +gfx950,256,1,16384,6656,torch.int8,34,0,22.3992,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.74,4870.32,0.0 +gfx950,256,1,16384,8192,torch.int8,34,0,29.3945,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.13,4567.48,0.0 +gfx950,256,1,16384,13312,torch.int8,34,0,39.0876,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,11.16,5581.05,0.0 +gfx950,256,1,16384,26624,torch.int8,34,0,73.9812,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,11.79,5897.0,0.0 +gfx950,256,1,26624,16384,torch.int8,34,0,88.2574,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,9.88,4943.24,0.0 +gfx950,256,1,51200,5120,torch.int8,30,0,50.042,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,10.48,5240.63,0.0 +gfx950,256,1,53248,16384,torch.int8,57,0,170.1222,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_intrawave_v2,10.26,5128.89,0.0 +gfx950,256,1,57344,8192,torch.int8,76,0,88.899,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,10.57,5285.6,0.0 +gfx950,256,16,100,5120,torch.int8,34,0,8.7447,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1.87,68.28,0.0 +gfx950,256,16,200,5120,torch.int8,34,0,10.4705,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,3.13,106.23,0.0 +gfx950,256,16,800,5120,torch.int8,34,0,11.7219,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,11.18,358.6,0.0 +gfx950,256,16,1024,8192,torch.float8_e4m3fn,79,3,8.199,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,32.74,1043.11,0.1616 +gfx950,256,16,1280,8192,torch.int8,34,0,15.0987,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.22,705.87,0.0 +gfx950,256,16,1280,8192,torch.float8_e4m3fn,79,3,8.8968,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,37.72,1197.94,0.1643 +gfx950,256,16,2304,16384,torch.int8,34,0,23.498,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,51.41,1620.76,0.0 +gfx950,256,16,2560,8192,torch.int8,34,0,14.8202,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,45.28,1429.43,0.0 +gfx950,256,16,3584,8192,torch.float8_e4m3fn,34,2,10.689,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,87.9,2769.75,0.1092 +gfx950,256,16,4608,16384,torch.int8,34,0,25.7089,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,93.97,2952.56,0.0 +gfx950,256,16,5120,640,torch.int8,60,0,4.3842,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,23.92,787.12,0.0 +gfx950,256,16,5120,1280,torch.int8,79,0,6.1038,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,34.36,1103.89,0.0 +gfx950,256,16,5120,3200,torch.int8,79,0,9.1289,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,57.43,1818.3,0.0 +gfx950,256,16,5120,5120,torch.int8,34,0,11.1363,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,75.33,2376.03,0.0 +gfx950,256,16,5120,6400,torch.int8,34,0,14.8838,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,70.45,2219.48,0.0 +gfx950,256,16,5120,25600,torch.int8,34,0,38.7631,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,108.2,3396.15,0.0 +gfx950,256,16,6400,5120,torch.int8,34,0,11.5523,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.77,2861.31,0.0 +gfx950,256,16,7168,8192,torch.int8,34,0,16.6523,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,112.84,3547.9,0.0 +gfx950,256,16,7168,8192,torch.float8_e4m3fn,80,2,15.5438,a8w8_rowwise_128x16x64x128_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,120.89,3800.92,0.1089 +gfx950,256,16,7424,8192,torch.float8_e4m3fn,60,2,16.1728,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,120.34,3783.27,0.1076 +gfx950,256,16,8192,1024,torch.int8,79,0,5.4586,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,49.18,1587.79,0.0 +gfx950,256,16,8192,2048,torch.int8,79,0,7.2552,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,74.0,2353.09,0.0 +gfx950,256,16,8192,3584,torch.int8,34,0,10.6477,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,88.24,2787.42,0.0 +gfx950,256,16,8192,7168,torch.int8,34,0,15.2131,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,123.52,3884.62,0.0 +gfx950,256,16,8192,8192,torch.int8,34,0,17.2283,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,124.65,3918.09,0.0 +gfx950,256,16,8192,8192,torch.float8_e4m3fn,80,2,17.7026,a8w8_rowwise_128x16x64x128_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,121.31,3813.12,0.107 +gfx950,256,16,8192,28672,torch.int8,34,0,55.9502,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,134.34,4210.92,0.0 +gfx950,256,16,9216,16384,torch.int8,34,0,33.174,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,145.65,4568.4,0.0 +gfx950,256,16,10240,8192,torch.int8,34,0,18.6072,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,144.26,4532.91,0.0 +gfx950,256,16,10240,8192,torch.float8_e4m3fn,34,0,19.5591,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,137.24,4312.31,0.0 +gfx950,256,16,12800,5120,torch.int8,34,0,14.2563,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,147.1,4631.46,0.0 +gfx950,256,16,13312,16384,torch.int8,30,0,45.4589,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,153.53,4812.96,0.0 +gfx950,256,16,16384,2048,torch.int8,34,0,8.2068,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,130.84,4156.49,0.0 +gfx950,256,16,16384,4096,torch.int8,30,0,16.22,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,132.4,4173.78,0.0 +gfx950,256,16,16384,6656,torch.int8,34,0,21.7401,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,160.52,5045.18,0.0 +gfx950,256,16,16384,8192,torch.int8,30,0,29.6526,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,144.84,4548.44,0.0 +gfx950,256,16,16384,13312,torch.int8,34,0,38.6763,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,180.45,5658.27,0.0 +gfx950,256,16,16384,26624,torch.int8,34,0,73.7744,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,189.21,5925.6,0.0 +gfx950,256,16,26624,16384,torch.int8,30,0,86.8898,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,160.65,5033.06,0.0 +gfx950,256,16,28672,8192,torch.float8_e4m3fn,30,0,46.9441,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,160.11,5025.76,0.0 +gfx950,256,16,51200,5120,torch.int8,30,0,51.7349,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,162.15,5100.32,0.0 +gfx950,256,16,53248,16384,torch.int8,34,0,180.7857,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,154.42,4836.56,0.0 +gfx950,256,16,57344,8192,torch.int8,80,0,90.9234,a8w8_rowwise_128x16x64x128_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,165.33,5188.19,0.0 +gfx950,256,16,57344,8192,torch.float8_e4m3fn,69,2,92.3442,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v1,162.79,5108.37,0.1072 +gfx950,256,32,100,5120,torch.int8,34,0,9.2459,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,3.54,73.79,0.0 +gfx950,256,32,200,5120,torch.int8,34,0,10.4825,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.25,114.54,0.0 +gfx950,256,32,800,5120,torch.int8,34,0,11.9874,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,21.87,359.63,0.0 +gfx950,256,32,1024,8192,torch.float8_e4m3fn,79,3,8.7911,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,61.07,991.49,0.1586 +gfx950,256,32,1280,8192,torch.int8,34,0,14.9955,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,44.75,722.2,0.0 +gfx950,256,32,1280,8192,torch.float8_e4m3fn,60,3,9.3466,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,71.8,1158.69,0.1581 +gfx950,256,32,2304,16384,torch.int8,34,0,23.2436,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,103.94,1652.95,0.0 +gfx950,256,32,2560,8192,torch.int8,34,0,14.7991,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.69,1445.87,0.0 +gfx950,256,32,3584,8192,torch.float8_e4m3fn,34,2,12.64,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,148.66,2361.68,0.1084 +gfx950,256,32,4608,16384,torch.int8,34,0,24.957,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,193.61,3057.93,0.0 +gfx950,256,32,5120,640,torch.int8,79,0,4.5986,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,45.6,788.27,0.0 +gfx950,256,32,5120,1280,torch.int8,76,0,6.0464,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,69.37,1144.85,0.0 +gfx950,256,32,5120,3200,torch.int8,76,0,10.1191,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,103.62,1661.62,0.0 +gfx950,256,32,5120,5120,torch.int8,34,0,10.4248,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,160.94,2561.77,0.0 +gfx950,256,32,5120,6400,torch.int8,34,0,15.0885,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,138.99,2207.01,0.0 +gfx950,256,32,5120,25600,torch.int8,34,0,38.3258,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,218.88,3449.87,0.0 +gfx950,256,32,6400,5120,torch.int8,34,0,10.9816,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,190.97,3036.12,0.0 +gfx950,256,32,7168,8192,torch.int8,34,0,16.942,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,221.82,3508.51,0.0 +gfx950,256,32,7168,8192,torch.float8_e4m3fn,34,0,17.4961,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,214.8,3397.39,0.0 +gfx950,256,32,7424,8192,torch.float8_e4m3fn,34,0,17.6324,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,220.75,3491.0,0.0 +gfx950,256,32,8192,1024,torch.int8,79,0,5.6281,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,95.39,1589.46,0.0 +gfx950,256,32,8192,1024,torch.float8_e4m3fn,79,0,6.5212,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,82.33,1371.78,0.0 +gfx950,256,32,8192,2048,torch.int8,76,0,7.2889,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,147.31,2382.67,0.0 +gfx950,256,32,8192,3584,torch.int8,34,0,9.4243,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,199.38,3183.17,0.0 +gfx950,256,32,8192,7168,torch.int8,34,0,16.2902,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,230.7,3650.9,0.0 +gfx950,256,32,8192,8192,torch.int8,34,0,17.4586,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,246.01,3888.93,0.0 +gfx950,256,32,8192,8192,torch.float8_e4m3fn,34,0,18.0338,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,238.16,3764.89,0.0 +gfx950,256,32,8192,28672,torch.int8,34,0,54.7382,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,274.62,4317.33,0.0 +gfx950,256,32,9216,16384,torch.int8,30,0,34.6699,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,278.73,4387.35,0.0 +gfx950,256,32,10240,8192,torch.int8,30,0,20.8953,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,256.93,4058.5,0.0 +gfx950,256,32,10240,8192,torch.float8_e4m3fn,30,0,21.4049,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,250.82,3961.88,0.0 +gfx950,256,32,12800,5120,torch.int8,30,0,14.9578,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,280.41,4447.11,0.0 +gfx950,256,32,13312,16384,torch.int8,30,0,46.5921,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,299.59,4710.67,0.0 +gfx950,256,32,16384,2048,torch.int8,30,0,10.4352,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,205.79,3322.27,0.0 +gfx950,256,32,16384,4096,torch.int8,30,0,16.4693,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,260.79,4146.41,0.0 +gfx950,256,32,16384,6656,torch.int8,30,0,22.4014,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,311.56,4924.4,0.0 +gfx950,256,32,16384,8192,torch.int8,30,0,30.4324,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,282.26,4453.43,0.0 +gfx950,256,32,16384,13312,torch.int8,30,0,39.8351,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,350.41,5512.18,0.0 +gfx950,256,32,16384,26624,torch.int8,30,0,74.3742,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,375.36,5890.59,0.0 +gfx950,256,32,26624,16384,torch.int8,30,0,89.3239,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,312.54,4908.38,0.0 +gfx950,256,32,28672,8192,torch.float8_e4m3fn,23,0,47.2467,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,318.17,5015.76,0.0 +gfx950,256,32,51200,5120,torch.int8,30,0,53.9026,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,311.25,4927.12,0.0 +gfx950,256,32,53248,16384,torch.int8,34,0,168.8762,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,330.62,5189.29,0.0 +gfx950,256,32,57344,8192,torch.int8,81,0,93.0943,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_interwave_v2,322.95,5088.33,0.0 +gfx950,256,32,57344,8192,torch.float8_e4m3fn,62,0,92.8953,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,323.64,5099.23,0.0 +gfx950,256,48,7424,8192,torch.float8_e4m3fn,74,2,20.2638,a8w8_rowwise_128x64x32x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,288.12,3055.86,0.1076 +gfx950,256,64,100,5120,torch.int8,34,0,9.6716,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,6.78,88.14,0.0 +gfx950,256,64,192,1024,torch.int8,77,0,4.1582,a8w8_rowwise_64x16x16x64_16x16_1x1_4x16x1_4x16x1_1x16x1x4_4x4x1_1x1_interwave_v2,6.05,68.95,0.0 +gfx950,256,64,200,5120,torch.int8,34,0,9.9254,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,13.21,138.76,0.0 +gfx950,256,64,800,5120,torch.int8,34,0,12.0241,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,43.6,376.42,0.0 +gfx950,256,64,1024,8192,torch.float8_e4m3fn,34,2,9.0741,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,118.33,996.68,0.1083 +gfx950,256,64,1280,8192,torch.int8,34,0,14.7691,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.88,756.57,0.0 +gfx950,256,64,1280,8192,torch.float8_e4m3fn,79,2,11.0767,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,121.17,1008.77,0.1076 +gfx950,256,64,2304,16384,torch.int8,34,0,23.2558,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,207.77,1680.97,0.0 +gfx950,256,64,2560,8192,torch.int8,34,0,14.5967,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,183.9,1495.1,0.0 +gfx950,256,64,3584,8192,torch.float8_e4m3fn,55,2,15.5912,a8w8_rowwise_128x64x32x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,241.04,1946.17,0.1078 +gfx950,256,64,4608,16384,torch.int8,30,0,27.9735,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,345.46,2757.46,0.0 +gfx950,256,64,5120,640,torch.int8,77,0,5.0956,a8w8_rowwise_64x16x16x64_16x16_1x1_4x16x1_4x16x1_1x16x1x4_4x4x1_1x1_interwave_v2,82.31,779.72,0.0 +gfx950,256,64,5120,1280,torch.int8,76,0,6.4203,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,130.66,1135.6,0.0 +gfx950,256,64,5120,3200,torch.int8,76,0,10.7687,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,194.75,1601.32,0.0 +gfx950,256,64,5120,5120,torch.int8,30,0,11.7059,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,286.65,2323.4,0.0 +gfx950,256,64,5120,6400,torch.int8,30,0,16.907,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,248.08,2001.12,0.0 +gfx950,256,64,5120,25600,torch.int8,30,0,43.5801,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,384.97,3060.24,0.0 +gfx950,256,64,6400,5120,torch.int8,30,0,12.4817,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,336.04,2717.17,0.0 +gfx950,256,64,7168,8192,torch.int8,30,0,18.4286,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,407.85,3264.6,0.0 +gfx950,256,64,7168,8192,torch.float8_e4m3fn,30,0,19.78,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,379.99,3041.56,0.0 +gfx950,256,64,7424,8192,torch.float8_e4m3fn,30,0,20.8916,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,372.62,2981.68,0.0 +gfx950,256,64,8192,1024,torch.float8_e4m3fn,79,0,7.3716,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,145.66,1289.1,0.0 +gfx950,256,64,8192,2048,torch.int8,30,0,7.5798,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,283.32,2369.04,0.0 +gfx950,256,64,8192,3584,torch.int8,30,0,11.279,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,333.19,2716.38,0.0 +gfx950,256,64,8192,7168,torch.int8,30,0,17.495,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,429.62,3442.56,0.0 +gfx950,256,64,8192,8192,torch.int8,30,0,19.4045,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,442.68,3539.47,0.0 +gfx950,256,64,8192,8192,torch.float8_e4m3fn,30,0,19.8829,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,432.03,3454.31,0.0 +gfx950,256,64,8192,28672,torch.int8,30,0,61.7622,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,486.78,3849.68,0.0 +gfx950,256,64,9216,16384,torch.int8,23,0,41.0647,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,470.66,3731.26,0.0 +gfx950,256,64,10240,8192,torch.int8,28,0,27.6213,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,388.74,3103.44,0.0 +gfx950,256,64,10240,8192,torch.float8_e4m3fn,23,0,24.8931,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,431.34,3443.57,0.0 +gfx950,256,64,12800,5120,torch.int8,23,0,18.5404,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,452.45,3640.81,0.0 +gfx950,256,64,13312,16384,torch.int8,23,0,50.0251,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,558.07,4414.91,0.0 +gfx950,256,64,16384,2048,torch.int8,23,0,11.4046,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,376.6,3137.56,0.0 +gfx950,256,64,16384,4096,torch.int8,23,0,19.0989,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,449.76,3637.29,0.0 +gfx950,256,64,16384,6656,torch.int8,23,0,26.4384,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,527.97,4220.19,0.0 +gfx950,256,64,16384,8192,torch.int8,23,0,33.8597,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,507.38,4041.36,0.0 +gfx950,256,64,16384,13312,torch.int8,23,0,46.8247,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,596.21,4720.86,0.0 +gfx950,256,64,16384,26624,torch.int8,23,0,89.8884,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,621.15,4895.06,0.0 +gfx950,256,64,26624,16384,torch.int8,23,0,91.2922,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,611.6,4826.96,0.0 +gfx950,256,64,28672,8192,torch.float8_e4m3fn,23,0,52.7168,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,570.31,4535.09,0.0 +gfx950,256,64,51200,5120,torch.int8,23,0,61.5287,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,545.35,4372.35,0.0 +gfx950,256,64,53248,16384,torch.int8,23,0,184.3385,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,605.78,4775.34,0.0 +gfx950,256,64,57344,8192,torch.int8,46,0,99.6635,a8w8_rowwise_256x64x128x128_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,603.33,4792.39,0.0 +gfx950,256,64,57344,8192,torch.float8_e4m3fn,17,0,98.9474,a8w8_rowwise_256x64x224x128_16x16_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,607.69,4827.07,0.0 +gfx950,256,128,100,5120,torch.int8,34,0,10.6683,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,12.29,111.82,0.0 +gfx950,256,128,128,49920,torch.float8_e4m3fn,34,0,75.6047,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,21.64,169.46,0.0 +gfx950,256,128,128,80000,torch.float8_e4m3fn,34,0,119.6585,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,21.91,171.43,0.0 +gfx950,256,128,128,222336,torch.float8_e4m3fn,34,0,324.8197,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.43,175.33,0.0001 +gfx950,256,128,128,254848,torch.float8_e4m3fn,34,0,371.2338,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.49,175.83,0.0002 +gfx950,256,128,128,322816,torch.float8_e4m3fn,34,0,466.5241,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.67,177.21,0.0001 +gfx950,256,128,128,423168,torch.float8_e4m3fn,34,0,611.0012,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.69,177.35,0.0002 +gfx950,256,128,128,620160,torch.float8_e4m3fn,34,0,897.8064,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.63,176.87,0.0002 +gfx950,256,128,128,659584,torch.float8_e4m3fn,34,0,953.0292,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.68,177.21,0.0002 +gfx950,256,128,128,796544,torch.float8_e4m3fn,34,0,1151.3783,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.67,177.13,0.0004 +gfx950,256,128,128,941696,torch.float8_e4m3fn,34,0,1362.3745,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,22.65,176.98,0.0002 +gfx950,256,128,200,5120,torch.int8,34,0,9.652,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,27.16,179.3,0.0 +gfx950,256,128,256,256,torch.float8_e4m3fn,68,0,3.1268,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v1,5.37,52.4,0.0 +gfx950,256,128,256,1024,torch.float8_e4m3fn,79,0,4.0962,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,16.38,111.99,0.0 +gfx950,256,128,800,5120,torch.int8,34,0,11.5628,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,90.69,428.63,0.0 +gfx950,256,128,1024,8192,torch.float8_e4m3fn,34,2,12.0372,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,178.4,805.78,0.1084 +gfx950,256,128,1280,8192,torch.int8,34,0,13.9827,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,191.98,848.34,0.0 +gfx950,256,128,1280,8192,torch.float8_e4m3fn,29,3,13.1684,a8w8_rowwise_256x32x96x256_16x16_1x3_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,203.85,900.79,0.1595 +gfx950,256,128,2304,16384,torch.int8,30,0,26.5035,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,364.62,1525.67,0.0 +gfx950,256,128,2560,8192,torch.int8,30,0,16.5177,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,325.03,1372.8,0.0 +gfx950,256,128,3584,8192,torch.float8_e4m3fn,30,0,17.8922,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,420.08,1750.83,0.0 +gfx950,256,128,4608,16384,torch.int8,23,0,33.4047,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,578.58,2358.18,0.0 +gfx950,256,128,5120,640,torch.int8,79,0,5.566,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,150.71,838.92,0.0 +gfx950,256,128,5120,1280,torch.int8,23,0,7.6695,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,218.75,1046.76,0.0 +gfx950,256,128,5120,3200,torch.int8,23,0,11.8363,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,354.36,1529.56,0.0 +gfx950,256,128,5120,5120,torch.int8,23,0,14.1859,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,473.07,1986.51,0.0 +gfx950,256,128,5120,6400,torch.int8,23,0,18.7826,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,446.62,1857.99,0.0 +gfx950,256,128,5120,25600,torch.int8,23,0,54.0309,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,621.02,2510.78,0.0 +gfx950,256,128,6400,5120,torch.int8,23,0,14.96,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,560.74,2343.7,0.0 +gfx950,256,128,7168,8192,torch.int8,28,0,26.0097,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,577.95,2368.49,0.0 +gfx950,256,128,7168,8192,torch.float8_e4m3fn,23,0,22.197,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,677.23,2775.32,0.0 +gfx950,256,128,8192,1024,torch.int8,28,0,7.1186,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,301.67,1491.42,0.0 +gfx950,256,128,8192,1024,torch.float8_e4m3fn,15,0,8.3628,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,256.79,1269.53,0.0 +gfx950,256,128,8192,2048,torch.int8,23,0,10.2945,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,417.21,1858.91,0.0 +gfx950,256,128,8192,3584,torch.int8,28,0,13.8708,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,541.87,2300.95,0.0 +gfx950,256,128,8192,7168,torch.int8,28,0,24.2653,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,619.5,2544.16,0.0 +gfx950,256,128,8192,8192,torch.int8,28,0,26.3513,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,651.96,2666.08,0.0 +gfx950,256,128,8192,8192,torch.float8_e4m3fn,23,0,23.2545,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,738.78,3021.12,0.0 +gfx950,256,128,8192,28672,torch.int8,28,0,87.5085,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,687.13,2750.0,0.0 +gfx950,256,128,9216,16384,torch.int8,23,0,59.454,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,650.16,2614.65,0.0 +gfx950,256,128,10240,8192,torch.int8,15,0,31.2683,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,686.79,2800.16,0.0 +gfx950,256,128,10240,8192,torch.float8_e4m3fn,22,0,30.8204,a8w8_rowwise_256x64x96x256_16x16_2x3_16x16x1_16x16x1_1x64x1x4_8x8x1_2x1_intrawave_v3,696.77,2840.85,0.0 +gfx950,256,128,12800,5120,torch.int8,23,0,25.5896,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,655.63,2714.7,0.0 +gfx950,256,128,13312,16384,torch.int8,23,0,65.9651,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,846.43,3389.81,0.0 +gfx950,256,128,16384,2048,torch.int8,23,0,14.5668,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,589.69,2609.42,0.0 +gfx950,256,128,16384,4096,torch.int8,23,0,23.3525,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,735.68,3075.79,0.0 +gfx950,256,128,16384,6656,torch.int8,23,0,34.0329,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,820.3,3352.58,0.0 +gfx950,256,128,16384,8192,torch.int8,23,0,45.1385,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,761.21,3089.62,0.0 +gfx950,256,128,16384,13312,torch.int8,23,0,61.6432,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,905.77,3633.85,0.0 +gfx950,256,128,16384,26624,torch.int8,23,0,118.3409,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,943.62,3750.27,0.0 +gfx950,256,128,26624,16384,torch.int8,23,0,129.1529,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,864.63,3446.46,0.0 +gfx950,256,128,28672,8192,torch.float8_e4m3fn,41,0,60.0008,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1002.15,4054.44,0.0 +gfx950,256,128,51200,5120,torch.int8,23,0,82.0559,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,817.84,3362.42,0.0 +gfx950,256,128,53248,16384,torch.int8,23,0,230.1893,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,970.24,3858.32,0.0 +gfx950,256,128,57344,8192,torch.int8,13,0,108.1045,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1112.43,4490.94,0.0 +gfx950,256,128,57344,8192,torch.float8_e4m3fn,35,0,111.2293,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v4,1081.18,4364.77,0.0 +gfx950,256,128,105984,128,torch.float8_e4m3fn,44,0,15.537,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,223.52,2620.47,0.0 +gfx950,256,128,207616,128,torch.float8_e4m3fn,44,0,25.7878,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,263.81,3092.2,0.0 +gfx950,256,128,270080,128,torch.float8_e4m3fn,44,0,32.2348,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,274.55,3217.86,0.0 +gfx950,256,128,430336,128,torch.float8_e4m3fn,44,0,46.5205,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,303.12,3552.53,0.0 +gfx950,256,128,450560,128,torch.float8_e4m3fn,44,0,48.3917,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,305.09,3575.64,0.0 +gfx950,256,128,452352,128,torch.float8_e4m3fn,44,0,49.0249,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,302.35,3543.5,0.0 +gfx950,256,128,691968,128,torch.float8_e4m3fn,44,0,71.2418,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,318.27,3730.0,0.0 +gfx950,256,128,772736,128,torch.float8_e4m3fn,44,0,79.0658,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,320.25,3753.17,0.0 +gfx950,256,128,855424,128,torch.float8_e4m3fn,44,0,86.4286,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,324.32,3800.82,0.0 +gfx950,256,128,911744,128,torch.float8_e4m3fn,44,0,91.8719,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1,325.19,3811.02,0.0 +gfx950,256,192,1280,8192,torch.float8_e4m3fn,34,0,15.3959,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,261.53,815.16,0.0 +gfx950,256,192,8192,1024,torch.float8_e4m3fn,21,0,8.6496,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,372.41,1356.24,0.0 +gfx950,256,256,100,5120,torch.int8,34,0,10.9556,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,23.93,171.05,0.0 +gfx950,256,256,200,5120,torch.int8,34,0,9.8147,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,53.42,248.31,0.0 +gfx950,256,256,256,128,torch.float8_e4m3fn,49,0,2.7541,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_intrawave_v1,6.09,71.39,0.0 +gfx950,256,256,512,640,torch.float8_e4m3fn,79,0,3.6844,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,45.54,204.56,0.0 +gfx950,256,256,800,5120,torch.int8,34,0,11.8423,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,177.09,491.15,0.0 +gfx950,256,256,1024,8192,torch.float8_e4m3fn,34,0,14.7059,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,292.06,748.68,0.0 +gfx950,256,256,1280,8192,torch.int8,30,0,16.2863,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,329.65,812.85,0.0 +gfx950,256,256,1280,8192,torch.float8_e4m3fn,30,0,17.037,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,315.12,777.03,0.0 +gfx950,256,256,2304,16384,torch.int8,23,0,32.4817,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,595.02,1327.6,0.0 +gfx950,256,256,2560,8192,torch.int8,28,0,23.112,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,464.58,1054.84,0.0 +gfx950,256,256,3584,8192,torch.float8_e4m3fn,23,0,20.9257,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,718.37,1590.98,0.0 +gfx950,256,256,4608,16384,torch.int8,23,0,55.0189,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,702.57,1491.33,0.0 +gfx950,256,256,5120,640,torch.int8,81,0,6.6084,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_interwave_v2,253.88,917.33,0.0 +gfx950,256,256,5120,1280,torch.int8,15,0,8.2757,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,405.46,1148.27,0.0 +gfx950,256,256,5120,3200,torch.int8,15,0,15.7837,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,531.47,1256.02,0.0 +gfx950,256,256,5120,5120,torch.int8,23,0,21.8405,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,614.54,1380.31,0.0 +gfx950,256,256,5120,6400,torch.int8,21,0,23.7832,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,705.42,1556.89,0.0 +gfx950,256,256,5120,25600,torch.int8,23,0,86.0166,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,780.19,1630.46,0.0 +gfx950,256,256,6400,5120,torch.int8,23,0,22.7861,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,736.29,1639.4,0.0 +gfx950,256,256,7168,8192,torch.int8,15,0,30.291,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,992.53,2128.93,0.0 +gfx950,256,256,7168,8192,torch.float8_e4m3fn,15,0,30.0111,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1001.79,2148.79,0.0 +gfx950,256,256,8192,1024,torch.int8,21,0,8.6043,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,499.17,1492.86,0.0 +gfx950,256,256,8192,1024,torch.float8_e4m3fn,15,0,9.1986,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,466.92,1396.41,0.0 +gfx950,256,256,8192,2048,torch.int8,15,0,10.9387,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,785.28,1965.12,0.0 +gfx950,256,256,8192,3584,torch.int8,15,0,16.5846,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,906.41,2078.55,0.0 +gfx950,256,256,8192,7168,torch.int8,21,0,29.138,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1031.81,2222.17,0.0 +gfx950,256,256,8192,8192,torch.int8,15,0,31.6256,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1086.45,2320.91,0.0 +gfx950,256,256,8192,8192,torch.float8_e4m3fn,15,0,31.0706,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1105.86,2362.37,0.0 +gfx950,256,256,8192,28672,torch.int8,15,0,102.0397,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1178.55,2414.9,0.0 +gfx950,256,256,9216,16384,torch.int8,23,0,86.8248,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,890.41,1841.73,0.0 +gfx950,256,256,10240,8192,torch.int8,41,0,42.9954,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,998.94,2121.76,0.0 +gfx950,256,256,10240,8192,torch.float8_e4m3fn,14,0,37.6153,a8w8_rowwise_256x128x96x256_32x32_1x3_16x16x1_16x16x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1141.81,2425.24,0.0 +gfx950,256,256,12800,5120,torch.int8,15,0,42.5463,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,788.66,1725.19,0.0 +gfx950,256,256,13312,16384,torch.int8,23,0,113.4219,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,984.55,2020.01,0.0 +gfx950,256,256,16384,2048,torch.int8,15,0,22.0982,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,777.43,1921.76,0.0 +gfx950,256,256,16384,4096,torch.int8,15,0,39.8495,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,862.24,1920.88,0.0 +gfx950,256,256,16384,6656,torch.int8,23,0,57.6756,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,968.08,2065.77,0.0 +gfx950,256,256,16384,8192,torch.int8,15,0,72.9104,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,942.52,1984.68,0.0 +gfx950,256,256,16384,13312,torch.int8,23,0,106.2833,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1050.67,2163.09,0.0 +gfx950,256,256,16384,26624,torch.int8,23,0,202.2465,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1104.29,2231.99,0.0 +gfx950,256,256,26624,16384,torch.int8,23,0,206.2403,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1082.9,2201.48,0.0 +gfx950,256,256,28672,8192,torch.float8_e4m3fn,4,0,80.7001,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1490.2,3118.44,0.0 +gfx950,256,256,51200,5120,torch.int8,40,0,120.9285,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1109.89,2395.38,0.0 +gfx950,256,256,53248,16384,torch.int8,40,0,368.0253,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1213.71,2456.01,0.0 +gfx950,256,256,57344,8192,torch.int8,2,0,135.372,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1776.72,3702.53,0.0 +gfx950,256,256,57344,8192,torch.float8_e4m3fn,2,0,140.7498,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1708.83,3561.07,0.0 +gfx950,256,320,1280,8192,torch.float8_e4m3fn,30,0,17.4934,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,383.62,796.09,0.0 +gfx950,256,320,8192,1024,torch.float8_e4m3fn,62,0,12.0692,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2,444.83,1156.59,0.0 +gfx950,256,384,8448,30080,torch.float8_e4m3fn,41,0,147.1478,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1326.29,1849.53,0.0 +gfx950,256,512,100,5120,torch.int8,34,0,11.1055,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,47.21,291.37,0.0 +gfx950,256,512,200,5120,torch.int8,34,0,11.8774,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,88.28,324.17,0.0 +gfx950,256,512,800,5120,torch.int8,30,0,13.1908,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,317.97,571.36,0.0 +gfx950,256,512,1280,8192,torch.int8,28,0,23.2059,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,462.7,689.08,0.0 +gfx950,256,512,1280,8192,torch.float8_e4m3fn,23,0,21.2265,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,505.85,753.34,0.0 +gfx950,256,512,1792,7424,torch.float8_e4m3fn,23,0,20.9911,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,648.99,902.28,0.0 +gfx950,256,512,1920,6784,torch.float8_e4m3fn,23,0,19.552,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,682.18,944.39,0.0 +gfx950,256,512,2304,16384,torch.int8,23,0,54.2058,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,713.11,894.68,0.0 +gfx950,256,512,2560,8192,torch.int8,21,0,27.6646,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,776.26,1004.43,0.0 +gfx950,256,512,4608,16384,torch.int8,23,0,87.1647,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,886.93,1016.52,0.0 +gfx950,256,512,5120,640,torch.int8,47,0,8.4036,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,399.29,1052.81,0.0 +gfx950,256,512,5120,1280,torch.int8,22,0,12.2285,a8w8_rowwise_256x64x96x256_16x16_2x3_16x16x1_16x16x1_1x64x1x4_8x8x1_2x1_intrawave_v3,548.79,1018.26,0.0 +gfx950,256,512,5120,3200,torch.int8,13,0,21.8498,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,767.84,1064.78,0.0 +gfx950,256,512,5120,5120,torch.int8,23,0,33.1376,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,810.06,1028.4,0.0 +gfx950,256,512,5120,6400,torch.int8,15,0,38.8017,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,864.77,1064.07,0.0 +gfx950,256,512,5120,25600,torch.int8,23,0,131.897,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1017.6,1132.87,0.0 +gfx950,256,512,6400,5120,torch.int8,15,0,41.3468,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,811.54,1014.42,0.0 +gfx950,256,512,7168,8192,torch.int8,41,0,47.5197,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1265.36,1478.43,0.0 +gfx950,256,512,8192,1024,torch.int8,13,0,11.7749,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,729.51,1469.35,0.0 +gfx950,256,512,8192,1024,torch.float8_e4m3fn,13,0,12.6356,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,679.82,1369.27,0.0 +gfx950,256,512,8192,2048,torch.int8,41,0,16.8904,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1017.14,1552.03,0.0 +gfx950,256,512,8192,3584,torch.int8,41,0,24.9561,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1204.71,1586.14,0.0 +gfx950,256,512,8192,7168,torch.int8,41,0,44.168,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1361.38,1602.49,0.0 +gfx950,256,512,8192,8192,torch.int8,13,0,51.209,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1341.94,1556.21,0.0 +gfx950,256,512,8192,28672,torch.int8,41,0,150.8316,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1594.61,1710.18,0.0 +gfx950,256,512,9216,16384,torch.int8,23,0,141.9471,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1089.27,1189.32,0.0 +gfx950,256,512,10240,8192,torch.int8,11,0,61.9597,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1386.37,1590.81,0.0 +gfx950,256,512,12800,5120,torch.int8,23,0,74.3061,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,903.14,1093.65,0.0 +gfx950,256,512,13312,16384,torch.int8,23,0,199.0315,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1122.13,1206.46,0.0 +gfx950,256,512,16384,2048,torch.int8,15,0,38.4305,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,894.07,1336.96,0.0 +gfx950,256,512,16384,4096,torch.int8,15,0,65.4454,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1050.03,1313.82,0.0 +gfx950,256,512,16384,6656,torch.int8,15,0,102.3241,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1091.33,1263.02,0.0 +gfx950,256,512,16384,8192,torch.int8,15,0,123.6929,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1111.13,1254.63,0.0 +gfx950,256,512,16384,13312,torch.int8,15,0,191.2442,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1167.82,1263.81,0.0 +gfx950,256,512,16384,26624,torch.int8,15,0,365.7012,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1221.43,1275.95,0.0 +gfx950,256,512,26624,16384,torch.int8,15,0,349.3418,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1278.62,1350.71,0.0 +gfx950,256,512,51200,5120,torch.int8,40,0,223.8159,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1199.36,1417.21,0.0 +gfx950,256,512,53248,16384,torch.int8,15,0,651.6344,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1370.94,1435.36,0.0 +gfx950,256,512,57344,8192,torch.int8,2,0,233.9286,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2056.34,2277.09,0.0 +gfx950,256,640,128,128,torch.float8_e4m3fn,50,0,2.799,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,7.49,93.66,0.0 +gfx950,256,640,256,768,torch.float8_e4m3fn,79,0,3.9362,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,63.93,258.07,0.0 +gfx950,256,640,384,128,torch.float8_e4m3fn,49,0,2.9702,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_intrawave_v1,21.18,209.61,0.0 +gfx950,256,640,768,896,torch.float8_e4m3fn,76,0,5.2439,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2,167.97,428.04,0.0 +gfx950,256,1024,100,5120,torch.int8,34,0,11.6746,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,89.82,510.48,0.0 +gfx950,256,1024,200,5120,torch.int8,34,0,12.0055,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,174.68,556.12,0.0 +gfx950,256,1024,800,5120,torch.int8,23,0,15.5235,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,540.38,707.14,0.0 +gfx950,256,1024,1280,8192,torch.int8,21,0,27.7294,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,774.44,775.2,0.0 +gfx950,256,1024,1280,8192,torch.float8_e4m3fn,21,0,28.4599,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,754.56,755.3,0.0 +gfx950,256,1024,2304,16384,torch.int8,23,0,86.7135,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,891.55,683.22,0.0 +gfx950,256,1024,2560,8192,torch.int8,41,0,42.1202,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1019.69,821.53,0.0 +gfx950,256,1024,4608,16384,torch.int8,23,0,143.2677,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1079.23,709.94,0.0 +gfx950,256,1024,5120,640,torch.int8,11,0,11.7704,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,570.15,1224.93,0.0 +gfx950,256,1024,5120,1280,torch.int8,11,0,17.5871,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,763.16,1043.38,0.0 +gfx950,256,1024,5120,3200,torch.int8,11,0,27.688,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1211.88,1088.8,0.0 +gfx950,256,1024,5120,5120,torch.int8,23,0,55.7649,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,962.74,752.14,0.0 +gfx950,256,1024,5120,6400,torch.int8,11,0,57.5786,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1165.52,865.03,0.0 +gfx950,256,1024,5120,25600,torch.int8,23,0,227.1849,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1181.57,738.48,0.0 +gfx950,256,1024,6400,5120,torch.int8,23,0,72.0506,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,931.41,709.47,0.0 +gfx950,256,1024,7168,8192,torch.int8,13,0,82.6278,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1455.43,989.85,0.0 +gfx950,256,1024,8192,1024,torch.int8,13,0,17.3848,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,988.21,1507.89,0.0 +gfx950,256,1024,8192,1024,torch.float8_e4m3fn,13,0,18.4495,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,931.18,1420.87,0.0 +gfx950,256,1024,8192,2048,torch.int8,41,0,27.2285,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1261.9,1309.35,0.0 +gfx950,256,1024,8192,3584,torch.int8,13,0,43.9928,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1366.8,1132.17,0.0 +gfx950,256,1024,8192,7168,torch.int8,13,0,79.3336,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1515.87,1044.17,0.0 +gfx950,256,1024,8192,8192,torch.int8,13,0,88.9386,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1545.32,1037.51,0.0 +gfx950,256,1024,8192,28672,torch.int8,41,0,274.6402,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1751.51,1023.22,0.0 +gfx950,256,1024,9216,16384,torch.int8,15,0,243.9282,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1267.74,765.17,0.0 +gfx950,256,1024,10240,8192,torch.int8,4,0,89.391,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1921.88,1266.86,0.0 +gfx950,256,1024,12800,5120,torch.int8,40,0,104.2861,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1287.01,930.07,0.0 +gfx950,256,1024,13312,16384,torch.int8,40,0,306.5211,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1457.25,855.22,0.0 +gfx950,256,1024,16384,2048,torch.int8,1,0,61.8687,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1110.73,1118.59,0.0 +gfx950,256,1024,16384,4096,torch.int8,1,0,108.4683,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1267.09,966.71,0.0 +gfx950,256,1024,16384,6656,torch.int8,1,0,165.4754,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1349.68,902.99,0.0 +gfx950,256,1024,16384,8192,torch.int8,1,0,202.2384,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1359.18,871.05,0.0 +gfx950,256,1024,16384,13312,torch.int8,1,0,307.7016,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1451.66,862.17,0.0 +gfx950,256,1024,16384,26624,torch.int8,1,0,603.8507,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1479.43,823.09,0.0 +gfx950,256,1024,26624,16384,torch.int8,15,0,630.2527,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1417.45,805.25,0.0 +gfx950,256,1024,51200,5120,torch.int8,39,0,396.7395,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1353.21,938.26,0.0 +gfx950,256,1024,53248,16384,torch.int8,40,0,1195.3563,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1494.71,835.1,0.0 +gfx950,256,1024,57344,8192,torch.int8,2,0,438.8629,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2192.19,1357.12,0.0 +gfx950,256,2048,100,5120,torch.int8,34,0,11.6595,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,179.87,978.37,0.0 +gfx950,256,2048,200,5120,torch.int8,30,0,13.0876,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,320.48,942.03,0.0 +gfx950,256,2048,800,5120,torch.int8,23,0,24.3923,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,687.81,732.14,0.0 +gfx950,256,2048,1280,8192,torch.int8,41,0,42.0169,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1022.2,773.64,0.0 +gfx950,256,2048,1280,8192,torch.float8_e4m3fn,14,0,36.1844,a8w8_rowwise_256x128x96x256_32x32_1x3_16x16x1_16x16x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1186.97,898.34,0.0 +gfx950,256,2048,2304,16384,torch.int8,23,0,145.2155,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1064.75,556.0,0.0 +gfx950,256,2048,2560,8192,torch.int8,11,0,61.0366,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1407.34,790.26,0.0 +gfx950,256,2048,4608,16384,torch.int8,11,0,247.0665,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1251.64,517.78,0.0 +gfx950,256,2048,5120,640,torch.int8,10,0,17.6749,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,759.37,1446.06,0.0 +gfx950,256,2048,5120,1280,torch.int8,4,0,27.4475,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,978.0,1098.34,0.0 +gfx950,256,2048,5120,3200,torch.int8,4,0,44.9517,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1492.91,976.81,0.0 +gfx950,256,2048,5120,5120,torch.int8,15,0,96.048,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1117.92,600.45,0.0 +gfx950,256,2048,5120,6400,torch.int8,4,0,90.0028,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1491.26,742.72,0.0 +gfx950,256,2048,5120,25600,torch.int8,15,0,419.2679,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1280.5,487.69,0.0 +gfx950,256,2048,6400,5120,torch.int8,39,0,107.5421,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1248.05,645.96,0.0 +gfx950,256,2048,7168,8192,torch.int8,2,0,116.6871,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2061.22,898.62,0.0 +gfx950,256,2048,8192,1024,torch.int8,1,0,29.7892,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1153.43,1478.39,0.0 +gfx950,256,2048,8192,1024,torch.float8_e4m3fn,43,0,31.5009,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1090.75,1398.06,0.0 +gfx950,256,2048,8192,2048,torch.int8,1,0,42.7444,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1607.68,1275.63,0.0 +gfx950,256,2048,8192,3584,torch.int8,1,0,65.1518,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1845.83,1078.32,0.0 +gfx950,256,2048,8192,7168,torch.int8,1,0,113.2102,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2124.53,944.74,0.0 +gfx950,256,2048,8192,8192,torch.int8,1,0,125.3472,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2192.93,936.92,0.0 +gfx950,256,2048,8192,28672,torch.int8,1,0,387.0603,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2485.59,845.23,0.0 +gfx950,256,2048,9216,16384,torch.int8,15,0,431.0243,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1434.9,515.74,0.0 +gfx950,256,2048,10240,8192,torch.int8,4,0,170.1034,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2019.93,838.35,0.0 +gfx950,256,2048,12800,5120,torch.int8,39,0,188.6903,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1422.62,680.75,0.0 +gfx950,256,2048,13312,16384,torch.int8,40,0,609.7983,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1465.0,502.11,0.0 +gfx950,256,2048,16384,2048,torch.int8,1,0,109.435,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1255.9,958.17,0.0 +gfx950,256,2048,16384,4096,torch.int8,1,0,192.1657,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1430.42,742.1,0.0 +gfx950,256,2048,16384,6656,torch.int8,1,0,297.4271,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1501.8,638.11,0.0 +gfx950,256,2048,16384,8192,torch.int8,1,0,351.3779,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1564.57,620.71,0.0 +gfx950,256,2048,16384,13312,torch.int8,1,0,562.4285,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1588.39,555.58,0.0 +gfx950,256,2048,16384,26624,torch.int8,1,0,1159.8861,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1540.42,480.95,0.0 +gfx950,256,2048,26624,16384,torch.int8,40,0,1145.701,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1559.49,505.21,0.0 +gfx950,256,2048,51200,5120,torch.int8,39,0,756.332,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1419.67,637.74,0.0 +gfx950,256,2048,53248,16384,torch.int8,1,0,2196.6099,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1626.79,511.73,0.0 +gfx950,256,2048,57344,8192,torch.int8,1,0,814.7053,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2361.77,885.5,0.0 +gfx950,256,4096,100,5120,torch.int8,34,0,16.1651,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,259.47,1379.68,0.0 +gfx950,256,4096,200,5120,torch.int8,23,0,15.2341,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3,550.65,1551.38,0.0 +gfx950,256,4096,800,5120,torch.int8,15,0,36.4711,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,920.03,867.02,0.0 +gfx950,256,4096,1280,8192,torch.int8,11,0,59.932,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1433.28,909.8,0.0 +gfx950,256,4096,1280,8192,torch.float8_e4m3fn,11,0,51.722,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1660.79,1054.21,0.0 +gfx950,256,4096,2304,16384,torch.int8,11,0,246.424,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,1254.9,502.11,0.0 +gfx950,256,4096,2560,8192,torch.int8,4,0,90.7509,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1893.08,831.92,0.0 +gfx950,256,4096,4608,16384,torch.int8,15,0,433.3975,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1427.04,416.14,0.0 +gfx950,256,4096,5120,640,torch.int8,13,0,32.4583,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,827.02,1473.93,0.0 +gfx950,256,4096,5120,1280,torch.int8,13,0,50.6317,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1060.35,1061.38,0.0 +gfx950,256,4096,5120,3200,torch.int8,4,0,86.6115,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1549.65,824.77,0.0 +gfx950,256,4096,5120,5120,torch.int8,15,0,177.0814,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1212.71,503.32,0.0 +gfx950,256,4096,5120,6400,torch.int8,4,0,162.2341,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1654.62,622.1,0.0 +gfx950,256,4096,5120,25600,torch.int8,15,0,782.4893,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1372.21,355.11,0.0 +gfx950,256,4096,6400,5120,torch.int8,39,0,188.523,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1423.89,563.16,0.0 +gfx950,256,4096,7168,8192,torch.int8,2,0,219.2844,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2193.66,688.58,0.0 +gfx950,256,4096,7424,8192,torch.float8_e4m3fn,1,0,232.7657,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2140.42,666.72,0.0 +gfx950,256,4096,8192,1024,torch.int8,1,0,55.6305,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1235.28,1432.52,0.0 +gfx950,256,4096,8192,1024,torch.float8_e4m3fn,43,0,59.9656,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1145.98,1328.96,0.0 +gfx950,256,4096,8192,2048,torch.int8,1,0,83.0059,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1655.77,1111.66,0.0 +gfx950,256,4096,8192,3584,torch.int8,1,0,122.3648,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1965.58,908.34,0.0 +gfx950,256,4096,8192,7168,torch.int8,1,0,215.6239,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2230.9,719.72,0.0 +gfx950,256,4096,8192,8192,torch.int8,1,0,240.0438,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2290.23,698.92,0.0 +gfx950,256,4096,8192,28672,torch.int8,1,0,764.1022,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2518.18,548.92,0.0 +gfx950,256,4096,9216,16384,torch.int8,39,0,778.6417,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1588.6,377.07,0.0 +gfx950,256,4096,10240,8192,torch.int8,1,0,318.603,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2156.9,631.9,0.0 +gfx950,256,4096,12800,5120,torch.int8,39,0,356.6531,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1505.3,536.56,0.0 +gfx950,256,4096,13312,16384,torch.int8,10,0,1114.1158,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1603.7,353.88,0.0 +gfx950,256,4096,16384,2048,torch.int8,1,0,205.0841,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1340.32,858.97,0.0 +gfx950,256,4096,16384,4096,torch.int8,39,0,353.73,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1554.17,616.58,0.0 +gfx950,256,4096,16384,6656,torch.int8,39,0,561.0198,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1592.37,482.22,0.0 +gfx950,256,4096,16384,8192,torch.int8,39,0,657.0519,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1673.4,459.61,0.0 +gfx950,256,4096,16384,13312,torch.int8,1,0,1080.1678,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1654.1,376.65,0.0 +gfx950,256,4096,16384,26624,torch.int8,1,0,2197.8702,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1625.85,309.15,0.0 +gfx950,256,4096,26624,16384,torch.int8,10,0,2199.7579,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1624.46,327.95,0.0 +gfx950,256,4096,51200,5120,torch.int8,1,0,1418.8126,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1513.58,495.16,0.0 +gfx950,256,4096,53248,16384,torch.int8,1,0,4075.2597,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1753.71,337.58,0.0 +gfx950,256,4096,57344,8192,torch.int8,1,0,1617.8989,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2378.57,601.45,0.0 +gfx950,256,5120,7424,8192,torch.float8_e4m3fn,2,0,308.3064,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2019.97,579.88,0.0 +gfx950,256,8192,100,5120,torch.int8,29,0,27.2037,a8w8_rowwise_256x32x96x256_16x16_1x3_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,308.36,1620.86,0.0 +gfx950,256,8192,200,5120,torch.int8,21,0,25.2734,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,663.83,1829.74,0.0 +gfx950,256,8192,800,5120,torch.int8,41,0,65.9722,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,1017.23,896.53,0.0 +gfx950,256,8192,1280,8192,torch.int8,4,0,95.3807,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1801.19,1033.4,0.0 +gfx950,256,8192,1280,8192,torch.float8_e4m3fn,4,0,81.7243,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2102.17,1206.08,0.0 +gfx950,256,8192,2304,16384,torch.int8,15,0,463.8213,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1333.43,452.15,0.0 +gfx950,256,8192,2560,8192,torch.int8,4,0,173.6356,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1978.84,748.83,0.0 +gfx950,256,8192,4608,16384,torch.int8,39,0,844.5637,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1464.6,337.7,0.0 +gfx950,256,8192,5120,640,torch.int8,42,0,55.1814,a8w8_rowwise_256x128x256x64_32x32_2x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,972.92,1674.58,0.0 +gfx950,256,8192,5120,1280,torch.int8,39,0,86.4091,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1242.63,1168.0,0.0 +gfx950,256,8192,5120,3200,torch.int8,40,0,149.3935,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1796.83,846.65,0.0 +gfx950,256,8192,5120,5120,torch.int8,39,0,277.9848,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1545.04,546.95,0.0 +gfx950,256,8192,5120,6400,torch.int8,40,0,276.3348,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1942.83,611.88,0.0 +gfx950,256,8192,5120,25600,torch.int8,39,0,1513.1737,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1419.19,280.65,0.0 +gfx950,256,8192,6400,5120,torch.int8,39,0,359.0195,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1495.38,500.16,0.0 +gfx950,256,8192,7168,8192,torch.int8,2,0,420.8889,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2285.81,577.99,0.0 +gfx950,256,8192,7424,8192,torch.float8_e4m3fn,1,0,441.2013,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2258.45,565.64,0.0 +gfx950,256,8192,8192,1024,torch.int8,1,0,105.9654,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1297.02,1424.95,0.0 +gfx950,256,8192,8192,1024,torch.float8_e4m3fn,10,0,119.8588,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1146.67,1259.77,0.0 +gfx950,256,8192,8192,2048,torch.int8,1,0,159.9883,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1718.11,1048.65,0.0 +gfx950,256,8192,8192,3584,torch.int8,1,0,232.9396,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2065.07,828.27,0.0 +gfx950,256,8192,8192,7168,torch.int8,1,0,412.978,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2329.6,609.37,0.0 +gfx950,256,8192,8192,8192,torch.int8,1,0,469.2882,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2342.93,572.01,0.0 +gfx950,256,8192,8192,28672,torch.int8,1,0,1532.6908,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2510.81,394.06,0.0 +gfx950,256,8192,9216,16384,torch.int8,43,0,1557.3892,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1588.49,280.09,0.0 +gfx950,256,8192,10240,8192,torch.int8,1,0,575.9516,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2386.29,553.46,0.0 +gfx950,256,8192,12800,5120,torch.int8,39,0,683.3672,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1571.25,464.16,0.0 +gfx950,256,8192,13312,16384,torch.int8,40,0,2189.1084,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1632.36,260.57,0.0 +gfx950,256,8192,16384,2048,torch.int8,39,0,392.4492,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1400.83,812.25,0.0 +gfx950,256,8192,16384,4096,torch.int8,39,0,678.4652,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1620.59,544.02,0.0 +gfx950,256,8192,16384,6656,torch.int8,39,0,1094.0621,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1633.09,394.87,0.0 +gfx950,256,8192,16384,8192,torch.int8,1,0,1288.1354,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1707.14,364.68,0.0 +gfx950,256,8192,16384,13312,torch.int8,1,0,2091.5748,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1708.48,284.76,0.0 +gfx950,256,8192,16384,26624,torch.int8,1,0,4736.0103,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1509.04,194.84,0.0 +gfx950,256,8192,26624,16384,torch.int8,40,0,4074.1172,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1754.2,247.08,0.0 +gfx950,256,8192,51200,5120,torch.int8,1,0,2639.7599,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1627.03,432.97,0.0 +gfx950,256,8192,53248,16384,torch.int8,40,0,8028.0784,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1780.46,234.06,0.0 +gfx950,256,8192,57344,8192,torch.int8,1,0,3164.0659,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2432.5,466.61,0.0 +gfx950,256,16384,100,5120,torch.int8,27,0,42.6746,a8w8_rowwise_256x32x160x256_16x16_1x5_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,393.14,2054.5,0.0 +gfx950,256,16384,200,5120,torch.int8,21,0,44.0772,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,761.27,2075.08,0.0 +gfx950,256,16384,800,5120,torch.int8,1,0,111.2384,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1206.58,1026.59,0.0 +gfx950,256,16384,1280,8192,torch.int8,4,0,178.1119,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1929.11,1047.92,0.0 +gfx950,256,16384,1280,8192,torch.float8_e4m3fn,4,0,174.2513,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1971.85,1071.13,0.0 +gfx950,256,16384,2304,16384,torch.int8,15,0,878.7062,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1407.7,434.37,0.0 +gfx950,256,16384,2560,8192,torch.int8,1,0,329.1334,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2087.89,726.38,0.0 +gfx950,256,16384,4608,16384,torch.int8,40,0,1632.4516,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1515.45,303.18,0.0 +gfx950,256,16384,5120,640,torch.int8,43,0,99.8836,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1074.99,1817.46,0.0 +gfx950,256,16384,5120,1280,torch.int8,1,0,153.6015,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1398.09,1271.45,0.0 +gfx950,256,16384,5120,3200,torch.int8,1,0,271.2735,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1979.08,872.13,0.0 +gfx950,256,16384,5120,5120,torch.int8,1,0,518.1604,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1657.78,536.27,0.0 +gfx950,256,16384,5120,6400,torch.int8,1,0,543.7765,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1974.6,561.62,0.0 +gfx950,256,16384,5120,25600,torch.int8,1,0,2869.8056,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1496.61,250.29,0.0 +gfx950,256,16384,6400,5120,torch.int8,40,0,690.0605,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1556.01,472.96,0.0 +gfx950,256,16384,7168,8192,torch.int8,1,0,815.6995,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2358.89,524.48,0.0 +gfx950,256,16384,8192,1024,torch.int8,10,0,198.2601,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1386.45,1480.89,0.0 +gfx950,256,16384,8192,1024,torch.float8_e4m3fn,10,0,221.8588,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1238.98,1323.37,0.0 +gfx950,256,16384,8192,2048,torch.int8,1,0,299.7045,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1834.33,1063.6,0.0 +gfx950,256,16384,8192,3584,torch.int8,1,0,463.2471,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2076.8,769.6,0.0 +gfx950,256,16384,8192,7168,torch.int8,1,0,819.3844,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2348.28,542.6,0.0 +gfx950,256,16384,8192,8192,torch.int8,1,0,933.9446,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2354.55,502.99,0.0 +gfx950,256,16384,8192,28672,torch.int8,1,0,3088.0838,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2492.35,315.11,0.0 +gfx950,256,16384,9216,16384,torch.int8,1,0,2949.0967,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1677.73,244.62,0.0 +gfx950,256,16384,10240,8192,torch.int8,1,0,1149.3883,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2391.51,481.69,0.0 +gfx950,256,16384,12800,5120,torch.int8,39,0,1303.8826,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1646.99,436.28,0.0 +gfx950,256,16384,13312,16384,torch.int8,1,0,4134.6494,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1728.52,223.17,0.0 +gfx950,256,16384,16384,2048,torch.int8,1,0,719.1952,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1528.81,839.8,0.0 +gfx950,256,16384,16384,4096,torch.int8,39,0,1327.6342,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1656.35,505.48,0.0 +gfx950,256,16384,16384,6656,torch.int8,39,0,2120.7951,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1684.94,355.99,0.0 +gfx950,256,16384,16384,8192,torch.int8,1,0,2555.2385,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1721.19,315.16,0.0 +gfx950,256,16384,16384,13312,torch.int8,1,0,4157.4955,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1719.02,234.05,0.0 +gfx950,256,16384,16384,26624,torch.int8,1,0,9368.503,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1525.71,150.43,0.0 +gfx950,256,16384,26624,16384,torch.int8,40,0,8090.2903,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1766.77,194.93,0.0 +gfx950,256,16384,51200,5120,torch.int8,1,0,5290.8019,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1623.56,382.5,0.0 +gfx950,256,16384,53248,16384,torch.int8,40,0,15996.8357,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1787.06,180.39,0.0 +gfx950,256,16384,57344,8192,torch.int8,1,0,6467.762,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2379.98,383.91,0.0 +gfx950,256,32768,100,5120,torch.int8,27,0,78.5325,a8w8_rowwise_256x32x160x256_16x16_1x5_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,427.27,2226.31,0.0 +gfx950,256,32768,200,5120,torch.int8,21,0,82.0643,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,817.76,2216.6,0.0 +gfx950,256,32768,800,5120,torch.int8,1,0,216.0528,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1242.45,1038.16,0.0 +gfx950,256,32768,1280,8192,torch.int8,1,0,339.8668,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2021.95,1067.5,0.0 +gfx950,256,32768,2304,16384,torch.int8,1,0,1717.9546,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1440.03,422.37,0.0 +gfx950,256,32768,2560,8192,torch.int8,1,0,599.612,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2292.13,762.46,0.0 +gfx950,256,32768,4608,16384,torch.int8,40,0,3026.9648,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1634.58,302.07,0.0 +gfx950,256,32768,5120,640,torch.int8,43,0,191.0117,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1,1124.27,1883.62,0.0 +gfx950,256,32768,5120,1280,torch.int8,1,0,299.0856,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1436.03,1284.05,0.0 +gfx950,256,32768,5120,3200,torch.int8,1,0,538.4921,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1993.98,848.27,0.0 +gfx950,256,32768,5120,5120,torch.int8,40,0,999.0598,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1719.6,530.03,0.0 +gfx950,256,32768,5120,6400,torch.int8,1,0,1028.2651,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2088.45,562.14,0.0 +gfx950,256,32768,5120,25600,torch.int8,1,0,5734.5832,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1497.92,227.65,0.0 +gfx950,256,32768,6400,5120,torch.int8,40,0,1274.1106,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1685.48,486.59,0.0 +gfx950,256,32768,7168,8192,torch.int8,1,0,1620.6254,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2374.57,491.73,0.0 +gfx950,256,32768,8192,1024,torch.int8,10,0,379.9256,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1447.01,1523.49,0.0 +gfx950,256,32768,8192,2048,torch.int8,1,0,594.1387,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1850.6,1044.8,0.0 +gfx950,256,32768,8192,3584,torch.int8,1,0,899.1908,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2139.86,760.32,0.0 +gfx950,256,32768,8192,7168,torch.int8,1,0,1640.4404,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2345.89,506.25,0.0 +gfx950,256,32768,8192,8192,torch.int8,1,0,1837.7153,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2393.21,474.73,0.0 +gfx950,256,32768,8192,28672,torch.int8,1,0,6209.7387,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2478.87,275.58,0.0 +gfx950,256,32768,9216,16384,torch.int8,1,0,5884.19,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1681.73,219.55,0.0 +gfx950,256,32768,10240,8192,torch.int8,1,0,2299.333,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2390.94,445.09,0.0 +gfx950,256,32768,12800,5120,torch.int8,39,0,2481.3683,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1730.89,432.09,0.0 +gfx950,256,32768,13312,16384,torch.int8,40,0,8230.5195,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1736.66,197.73,0.0 +gfx950,256,32768,16384,2048,torch.int8,1,0,1409.7173,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1559.9,833.08,0.0 +gfx950,256,32768,16384,4096,torch.int8,39,0,2546.0869,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1727.37,500.8,0.0 +gfx950,256,32768,16384,6656,torch.int8,39,0,4073.3594,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1754.53,343.92,0.0 +gfx950,256,32768,16384,8192,torch.int8,39,0,4965.434,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1771.47,297.33,0.0 +gfx950,256,32768,16384,13312,torch.int8,1,0,8271.2663,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1728.11,208.92,0.0 +gfx950,256,32768,16384,26624,torch.int8,1,0,19064.3766,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1499.51,124.96,0.0 +gfx950,256,32768,26624,16384,torch.int8,40,0,16029.2644,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1783.44,169.56,0.0 +gfx950,256,32768,51200,5120,torch.int8,1,0,10610.0611,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1619.21,356.77,0.0 +gfx950,256,32768,53248,16384,torch.int8,40,0,31823.1958,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1796.63,153.94,0.0 +gfx950,256,32768,57344,8192,torch.int8,1,0,12840.4952,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,2397.6,350.17,0.0 +gfx950,256,35200,256,19968,torch.float8_e4m3fn,1,0,245.3664,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,1466.67,2958.87,0.0 diff --git a/aiter/configs/bf16_tuned_batched_gemm.csv b/aiter/configs/bf16_tuned_batched_gemm.csv index 49fe6afdc6..79c20021b0 100644 --- a/aiter/configs/bf16_tuned_batched_gemm.csv +++ b/aiter/configs/bf16_tuned_batched_gemm.csv @@ -1,27 +1,27 @@ -cu_num,B,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio -304,16,1,1280,8192,78,0,111.9323,bf16_batched_64x16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,3.0,3000.45,0.0 -304,16,32,1280,8192,29,0,123.5316,bf16_batched_256x32x96x128_16x16_1x3_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,86.92,2794.78,0.0 -304,16,64,1280,8192,21,0,133.0927,bf16_batched_256x64x128x128_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,161.35,2666.89,0.0 -304,16,128,1280,8192,41,0,197.9104,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,217.02,1891.47,0.0 -304,16,192,1280,8192,11,0,231.4153,bf16_batched_256x128x160x64_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,278.39,1701.44,0.0 -304,16,256,1280,8192,11,0,256.7255,bf16_batched_256x128x160x64_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,334.6,1609.26,0.0 -304,16,320,1280,8192,13,0,329.4822,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,325.89,1312.78,0.0 -304,16,512,1280,8192,11,0,461.2279,bf16_batched_256x128x160x64_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,372.48,1063.97,0.0 -304,16,1024,1280,8192,41,0,773.6965,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,444.1,834.85,0.0 -304,16,2048,1280,8192,41,0,1381.6015,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,497.39,692.17,0.0 -304,16,4096,1280,8192,41,0,2666.4169,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,515.44,591.45,0.0 -304,16,8192,1280,8192,41,0,5488.8801,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,500.79,513.51,0.0 -304,16,16384,1280,8192,41,0,11248.3376,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,488.74,471.32,0.0 -304,16,1,8192,1024,79,0,75.979,bf16_batched_128x16x32x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,3.53,3536.9,0.0 -304,16,32,8192,1024,30,0,84.2878,bf16_batched_256x32x64x256_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,101.91,3296.71,0.0 -304,16,64,8192,1024,46,0,98.1849,bf16_batched_256x64x128x64_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,174.97,2926.21,0.0 -304,16,128,8192,1024,13,0,125.0599,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,274.75,2448.3,0.0 -304,16,192,8192,1024,13,0,169.3914,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,304.26,1918.98,0.0 -304,16,256,8192,1024,13,0,182.1985,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,377.17,1887.68,0.0 -304,16,320,8192,1024,13,0,245.1626,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,350.38,1479.86,0.0 -304,16,512,8192,1024,13,0,322.0123,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,426.81,1302.53,0.0 -304,16,1024,8192,1024,13,0,594.1558,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,462.64,960.06,0.0 -304,16,2048,8192,1024,0,0,1133.4458,bf16_batched_256x256x256x32_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4,485.03,769.7,0.0 -304,16,4096,8192,1024,0,0,2168.7724,bf16_batched_256x256x256x32_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4,506.97,680.75,0.0 -304,16,8192,8192,1024,13,0,4318.3042,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,509.23,621.62,0.0 -304,16,16384,8192,1024,0,0,8164.1629,bf16_batched_256x256x256x32_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4,538.7,624.71,0.0 \ No newline at end of file +gfx,cu_num,B,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx942,304,16,1,1280,8192,78,0,111.9323,bf16_batched_64x16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,3.0,3000.45,0.0 +gfx942,304,16,32,1280,8192,29,0,123.5316,bf16_batched_256x32x96x128_16x16_1x3_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3,86.92,2794.78,0.0 +gfx942,304,16,64,1280,8192,21,0,133.0927,bf16_batched_256x64x128x128_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,161.35,2666.89,0.0 +gfx942,304,16,128,1280,8192,41,0,197.9104,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,217.02,1891.47,0.0 +gfx942,304,16,192,1280,8192,11,0,231.4153,bf16_batched_256x128x160x64_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,278.39,1701.44,0.0 +gfx942,304,16,256,1280,8192,11,0,256.7255,bf16_batched_256x128x160x64_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,334.6,1609.26,0.0 +gfx942,304,16,320,1280,8192,13,0,329.4822,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,325.89,1312.78,0.0 +gfx942,304,16,512,1280,8192,11,0,461.2279,bf16_batched_256x128x160x64_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3,372.48,1063.97,0.0 +gfx942,304,16,1024,1280,8192,41,0,773.6965,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,444.1,834.85,0.0 +gfx942,304,16,2048,1280,8192,41,0,1381.6015,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,497.39,692.17,0.0 +gfx942,304,16,4096,1280,8192,41,0,2666.4169,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,515.44,591.45,0.0 +gfx942,304,16,8192,1280,8192,41,0,5488.8801,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,500.79,513.51,0.0 +gfx942,304,16,16384,1280,8192,41,0,11248.3376,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5,488.74,471.32,0.0 +gfx942,304,16,1,8192,1024,79,0,75.979,bf16_batched_128x16x32x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,3.53,3536.9,0.0 +gfx942,304,16,32,8192,1024,30,0,84.2878,bf16_batched_256x32x64x256_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,101.91,3296.71,0.0 +gfx942,304,16,64,8192,1024,46,0,98.1849,bf16_batched_256x64x128x64_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,174.97,2926.21,0.0 +gfx942,304,16,128,8192,1024,13,0,125.0599,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,274.75,2448.3,0.0 +gfx942,304,16,192,8192,1024,13,0,169.3914,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,304.26,1918.98,0.0 +gfx942,304,16,256,8192,1024,13,0,182.1985,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,377.17,1887.68,0.0 +gfx942,304,16,320,8192,1024,13,0,245.1626,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,350.38,1479.86,0.0 +gfx942,304,16,512,8192,1024,13,0,322.0123,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,426.81,1302.53,0.0 +gfx942,304,16,1024,8192,1024,13,0,594.1558,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,462.64,960.06,0.0 +gfx942,304,16,2048,8192,1024,0,0,1133.4458,bf16_batched_256x256x256x32_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4,485.03,769.7,0.0 +gfx942,304,16,4096,8192,1024,0,0,2168.7724,bf16_batched_256x256x256x32_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4,506.97,680.75,0.0 +gfx942,304,16,8192,8192,1024,13,0,4318.3042,bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3,509.23,621.62,0.0 +gfx942,304,16,16384,8192,1024,0,0,8164.1629,bf16_batched_256x256x256x32_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4,538.7,624.71,0.0 diff --git a/aiter/configs/bf16_tuned_gemm.csv b/aiter/configs/bf16_tuned_gemm.csv index 6c7b483c5a..60bbf06a70 100644 --- a/aiter/configs/bf16_tuned_gemm.csv +++ b/aiter/configs/bf16_tuned_gemm.csv @@ -1 +1 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw diff --git a/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv index 91e1c3ffbe..70848d78d9 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv @@ -1,721 +1,721 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,16,128,7168,asm,6,7,6.1485,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,4.78,168.54,0.0073 -256,32,128,7168,asm,7,8,6.3047,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,9.31,183.21,0.0012 -256,48,128,7168,asm,6,7,6.5243,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,13.5,195.25,0.0073 -256,64,128,7168,asm,7,8,6.337,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,18.53,219.76,0.0098 -256,80,128,7168,asm,7,8,8.0249,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,18.29,188.34,0.0045 -256,96,128,7168,asm,15,8,8.8365,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,19.94,184.49,0.0107 -256,112,128,7168,asm,6,7,7.6026,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,27.03,230.05,0.0042 -256,128,128,7168,asm,6,7,8.8728,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,26.47,210.51,0.0092 -256,144,128,7168,asm,7,8,8.0541,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,32.81,246.65,0.0111 -256,160,128,7168,asm,7,8,8.4521,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,34.74,249.09,0.0076 -256,176,128,7168,asm,23,8,7.9649,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,40.55,279.24,0.0048 -256,192,128,7168,asm,7,8,8.7548,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,40.24,267.61,0.0093 -256,208,128,7168,asm,7,8,7.5139,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,50.8,327.62,0.0117 -256,224,128,7168,asm,30,7,9.2902,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,44.24,277.76,0.0085 -256,240,128,7168,asm,7,8,9.2722,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,47.5,291.11,0.0119 -256,256,128,7168,asm,15,8,9.4713,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,49.6,297.54,0.0047 -256,288,128,7168,asm,15,8,8.3144,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,63.56,367.51,0.0025 -256,320,128,7168,asm,7,8,9.8117,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,59.85,335.64,0.0078 -256,352,128,7168,asm,7,8,9.4965,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,68.02,371.8,0.0046 -256,384,128,7168,asm,7,8,10.0883,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,69.85,373.53,0.0139 -256,416,128,7168,asm,7,8,10.1166,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,75.46,395.97,0.0081 -256,448,128,7168,asm,5,6,10.0989,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,81.4,420.19,0.0065 -256,480,128,7168,asm,31,8,9.5986,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,91.76,466.84,0.0125 -256,512,128,7168,asm,7,8,10.208,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,92.04,462.24,0.0076 -256,544,128,7168,asm,7,8,9.4877,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,105.21,522.38,0.0042 -256,576,128,7168,asm,7,8,9.4869,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,111.41,547.46,0.0102 -256,608,128,7168,asm,7,8,10.3885,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,107.4,522.82,0.0071 -256,640,128,7168,asm,7,8,9.5917,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,122.44,591.02,0.0014 -256,672,128,7168,asm,7,8,9.8577,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,125.09,599.17,0.005 -256,704,128,7168,asm,7,8,10.3689,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,124.59,592.54,0.0098 -256,736,128,7168,asm,7,8,10.5103,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,128.5,607.17,0.0015 -256,768,128,7168,asm,15,8,10.7333,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,131.3,616.69,0.002 -256,800,128,7168,asm,15,8,10.554,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,139.09,649.68,0.0109 -256,832,128,7168,asm,7,8,9.7953,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,155.86,724.25,0.0094 -256,864,128,7168,asm,7,8,10.7375,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,147.66,682.83,0.0101 -256,896,128,7168,asm,7,8,10.8442,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,151.62,698.01,0.0089 -256,928,128,7168,asm,15,8,11.0012,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,154.79,709.65,0.0108 -256,960,128,7168,asm,7,8,10.8867,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,161.81,738.93,0.0155 -256,992,128,7168,asm,7,8,11.0077,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,165.37,752.39,0.0145 -256,1024,128,7168,asm,7,8,9.6737,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,194.24,880.71,0.013 -256,1088,128,7168,asm,15,8,10.5533,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,189.18,852.32,0.0056 -256,1152,128,7168,asm,15,8,11.1072,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,190.32,852.6,0.0109 -256,1216,128,7168,asm,15,8,11.6079,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,192.23,856.75,0.0121 -256,1280,128,7168,asm,31,8,10.9477,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,214.55,951.82,0.0098 -256,1344,128,7168,asm,39,8,10.7369,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,229.7,1014.76,0.0017 -256,1408,128,7168,asm,15,8,10.2201,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,252.8,1112.56,0.009 -256,1472,128,7168,asm,14,7,11.8021,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,228.87,1003.69,0.0082 -256,1536,128,7168,asm,15,8,10.5213,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,267.89,1171.03,0.0109 -256,1600,128,7168,asm,22,7,12.0077,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,244.51,1065.64,0.0032 -256,1664,128,7168,asm,14,7,12.3669,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,246.91,1073.11,0.0067 -256,1728,128,7168,asm,31,8,12.5593,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,252.47,1094.5,0.0063 -256,1792,128,7168,asm,21,6,12.5025,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,263.01,1137.48,0.0064 -256,1856,128,7168,asm,13,6,12.1277,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,280.83,1211.81,0.0011 -256,1920,128,7168,asm,22,7,12.0941,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,291.32,1254.46,0.0058 -256,1984,128,7168,asm,29,6,12.2402,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,297.43,1278.31,0.0057 -256,2048,128,7168,asm,31,8,11.5875,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,324.32,1391.31,0.0105 -256,4096,128,7168,asm,18,3,16.9007,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,444.73,1853.55,0.0002 -256,8192,128,7168,ck,17,1,22.7549,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,660.62,2713.04,0.0 -256,16384,128,7168,asm,16,1,36.8278,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,816.36,3327.71,0.0 -256,32768,128,7168,asm,40,1,57.9434,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1037.73,4214.24,0.0 -256,16,2112,7168,ck,7,3,17.369,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.89,882.09,0.0 -256,32,2112,7168,ck,7,1,17.1099,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.63,906.1,0.0 -256,48,2112,7168,ck,7,3,17.0967,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,85.01,917.47,0.0 -256,64,2112,7168,ck,7,1,16.6867,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,116.13,950.93,0.0 -256,80,2112,7168,ck,7,1,17.0679,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,141.92,940.37,0.0 -256,96,2112,7168,ck,7,2,16.9001,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,171.99,960.49,0.0 -256,112,2112,7168,ck,7,1,16.7774,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,202.12,978.38,0.0 -256,128,2112,7168,ck,7,1,16.8979,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,229.35,982.19,0.0 -256,144,2112,7168,ck,7,0,17.4362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,250.05,962.32,0.0 -256,160,2112,7168,ck,12,3,18.1582,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.79,934.1,0.0 -256,176,2112,7168,ck,7,2,19.2838,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,276.34,889.03,0.0 -256,192,2112,7168,ck,12,2,18.0709,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,321.69,958.78,0.0 -256,208,2112,7168,ck,7,2,19.4853,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,323.21,898.54,0.0 -256,224,2112,7168,ck,12,1,18.1425,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,373.83,975.09,0.0 -256,240,2112,7168,ck,7,1,20.222,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,359.34,883.83,0.0 -256,256,2112,7168,ck,12,3,19.3304,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,400.98,934.03,0.0 -256,288,2112,7168,ck,12,0,19.761,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,441.27,932.12,0.0 -256,320,2112,7168,ck,17,1,21.3295,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,454.25,880.67,0.0 -256,352,2112,7168,ck,17,0,22.1299,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,481.6,865.29,0.0 -256,384,2112,7168,ck,17,3,21.2294,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,547.67,919.17,0.0 -256,416,2112,7168,ck,17,2,22.3197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,564.32,890.6,0.0 -256,448,2112,7168,ck,17,3,21.3596,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,635.05,947.7,0.0 -256,480,2112,7168,ck,12,1,23.7782,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,611.2,866.63,0.0 -256,512,2112,7168,ck,17,1,28.0778,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,552.11,746.91,0.0 -256,544,2112,7168,ck,17,0,29.6478,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,555.56,719.65,0.0 -256,576,2112,7168,ck,17,0,28.8581,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,604.33,751.98,0.0 -256,608,2112,7168,ck,12,0,30.6611,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,600.4,719.65,0.0 -256,640,2112,7168,ck,10,2,30.8536,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,628.05,726.97,0.0201 -256,672,2112,7168,ck,17,0,32.1211,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,633.43,709.63,0.0 -256,704,2112,7168,ck,17,0,31.5718,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,675.14,733.53,0.0 -256,736,2112,7168,ck,17,1,31.4913,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,707.63,746.98,0.0 -256,768,2112,7168,ck,17,3,31.0549,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,748.78,769.21,0.0 -256,800,2112,7168,ck,17,3,32.8542,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,737.26,738.18,0.0 -256,832,2112,7168,ck,17,0,32.1611,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,783.28,765.43,0.0 -256,864,2112,7168,ck,17,0,33.1831,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,788.35,752.84,0.0 -256,896,2112,7168,ck,17,3,32.5403,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,833.7,778.91,0.0 -256,928,2112,7168,ck,17,1,34.0573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,825.01,754.92,0.0 -256,960,2112,7168,ck,17,0,33.349,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,871.59,781.89,0.0 -256,992,2112,7168,ck,15,2,41.9166,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,716.55,630.77,0.0175 -256,1024,2112,7168,ck,15,2,41.7085,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,743.36,642.66,0.0022 -256,1088,2112,7168,ck,15,1,40.9809,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,803.84,671.86,0.0036 -256,1152,2112,7168,ck,15,0,40.8235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,854.41,692.31,0.0036 -256,1216,2112,7168,ck,15,3,42.2342,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,871.75,686.45,0.0044 -256,1280,2112,7168,ck,15,1,43.6734,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,887.39,680.52,0.0042 -256,1344,2112,7168,ck,15,1,45.0358,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,903.57,676.12,0.0055 -256,1408,2112,7168,ck,15,0,46.1981,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,922.78,674.89,0.0058 -256,1472,2112,7168,ck,15,0,46.921,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.87,680.03,0.0064 -256,1536,2112,7168,ck,17,0,55.8902,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,832.1,583.95,0.0 -256,1600,2112,7168,ck,15,2,56.4283,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,858.51,591.3,0.0067 -256,1664,2112,7168,ck,2,2,57.368,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,878.22,594.32,0.0004 -256,1728,2112,7168,ck,15,1,57.7674,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,905.7,602.83,0.0078 -256,1792,2112,7168,ck,2,1,59.3469,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,914.24,599.08,0.0006 -256,1856,2112,7168,ck,15,3,60.2407,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,932.85,602.29,0.0081 -256,1920,2112,7168,ck,15,1,62.5154,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,929.9,592.04,0.008 -256,1984,2112,7168,ck,15,3,64.5853,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,930.1,584.35,0.0082 -256,2048,2112,7168,ck,17,1,73.8185,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,840.01,521.14,0.0 -256,4096,2112,7168,ck,15,0,122.5407,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1012.05,504.33,0.0085 -256,8192,2112,7168,ck,15,3,223.4002,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1110.27,485.51,0.0091 -256,16384,2112,7168,ck,15,1,411.1387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1206.57,490.8,0.0095 -256,32768,2112,7168,ck,17,0,804.4843,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,1233.26,482.83,0.0 -256,16,2240,7168,ck,7,2,17.3187,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.67,937.87,0.0 -256,32,2240,7168,ck,7,1,17.2052,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,59.73,954.89,0.0 -256,48,2240,7168,ck,7,0,17.1856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,89.69,966.82,0.0 -256,64,2240,7168,ck,7,2,17.1082,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,120.13,982.09,0.0 -256,80,2240,7168,ck,7,1,17.1957,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,149.4,987.93,0.0 -256,96,2240,7168,ck,7,0,17.0763,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,180.53,1005.75,0.0 -256,112,2240,7168,ck,7,2,17.0073,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,211.47,1020.79,0.0 -256,128,2240,7168,ck,12,3,18.3162,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,224.41,958.02,0.0 -256,144,2240,7168,ck,7,1,17.4925,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,264.35,1013.78,0.0 -256,160,2240,7168,ck,12,0,18.2215,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,281.98,983.45,0.0 -256,176,2240,7168,ck,7,0,19.5377,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,289.28,926.74,0.0 -256,192,2240,7168,ck,12,3,18.244,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,337.95,1002.67,0.0 -256,208,2240,7168,ck,7,1,19.7375,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,338.41,936.24,0.0 -256,224,2240,7168,ck,12,3,18.0754,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,397.96,1032.65,0.0 -256,240,2240,7168,ck,17,1,22.0355,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,349.76,855.52,0.0 -256,256,2240,7168,ck,17,1,21.1511,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,388.67,900.1,0.0 -256,288,2240,7168,ck,12,0,19.8488,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,465.94,977.94,0.0 -256,320,2240,7168,ck,17,2,21.2632,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,483.28,930.42,0.0 -256,352,2240,7168,ck,17,1,22.208,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,508.99,907.62,0.0 -256,384,2240,7168,ck,17,2,21.4024,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,576.16,959.2,0.0 -256,416,2240,7168,ck,17,1,22.4706,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,594.5,930.19,0.0 -256,448,2240,7168,ck,17,2,21.5077,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,668.9,989.16,0.0 -256,480,2240,7168,ck,10,3,29.6061,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,520.64,731.18,0.0197 -256,512,2240,7168,ck,15,2,31.3316,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,524.76,702.81,0.0005 -256,544,2240,7168,ck,17,2,29.8347,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,585.54,750.56,0.0 -256,576,2240,7168,ck,17,0,29.2116,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,633.2,779.33,0.0 -256,608,2240,7168,ck,12,3,31.1053,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,627.69,743.87,0.0 -256,640,2240,7168,ck,17,3,31.3436,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,655.7,750.11,0.0 -256,672,2240,7168,ck,17,2,32.4726,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,664.55,735.51,0.0 -256,704,2240,7168,ck,17,3,31.5542,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,716.46,768.73,0.0 -256,736,2240,7168,ck,17,1,32.7615,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,721.42,751.77,0.0 -256,768,2240,7168,ck,17,0,31.996,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,770.8,781.41,0.0 -256,800,2240,7168,ck,17,2,33.3508,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,770.3,760.84,0.0 -256,832,2240,7168,ck,17,1,32.6843,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,817.45,787.76,0.0 -256,864,2240,7168,ck,17,2,34.2322,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,810.5,763.03,0.0 -256,896,2240,7168,ck,17,1,33.0197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,871.39,802.34,0.0 -256,928,2240,7168,ck,15,2,41.585,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,716.62,646.04,0.0173 -256,960,2240,7168,ck,15,1,39.9713,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,771.26,681.45,0.0025 -256,992,2240,7168,ck,15,1,43.5147,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,732.07,634.52,0.0179 -256,1024,2240,7168,ck,2,2,44.5432,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,738.23,628.24,0.0001 -256,1088,2240,7168,ck,15,2,41.8467,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,834.92,686.54,0.0035 -256,1152,2240,7168,ck,15,1,42.3981,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,872.53,695.19,0.0043 -256,1216,2240,7168,ck,15,1,44.3198,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,881.07,681.87,0.0047 -256,1280,2240,7168,ck,15,0,46.0014,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,893.54,673.15,0.0057 -256,1344,2240,7168,ck,15,3,46.7258,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,923.67,678.67,0.006 -256,1408,2240,7168,ck,15,3,55.5671,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,813.69,584.1,0.0061 -256,1472,2240,7168,ck,17,0,56.1099,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,842.45,591.73,0.0 -256,1536,2240,7168,ck,15,0,57.0356,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,864.81,595.2,0.0071 -256,1600,2240,7168,ck,15,0,57.9207,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,887.08,598.98,0.0072 -256,1664,2240,7168,ck,15,0,58.3656,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,915.53,607.18,0.008 -256,1728,2240,7168,ck,15,0,59.5866,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,931.26,607.25,0.0078 -256,1792,2240,7168,ck,15,1,60.5927,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.72,609.47,0.0083 -256,1856,2240,7168,ck,17,1,62.6283,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,951.66,601.57,0.0 -256,1920,2240,7168,ck,17,2,69.6783,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,884.87,551.4,0.0 -256,1984,2240,7168,ck,15,0,69.3874,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,918.2,564.45,0.0079 -256,2048,2240,7168,ck,17,0,75.0354,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,876.48,531.9,0.0 -256,4096,2240,7168,ck,15,0,125.1988,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1050.6,509.32,0.0092 -256,8192,2240,7168,ck,15,0,228.9556,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1148.99,486.89,0.0096 -256,16384,2240,7168,ck,15,3,434.2124,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1211.7,476.49,0.0095 -256,32768,2240,7168,ck,15,2,834.7736,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1260.54,476.46,0.0099 -256,16,3072,1536,ck,7,3,6.4633,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.36,749.07,0.0 -256,16,3072,3072,asm,5,6,7.3727,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,40.96,1300.02,0.0003 -256,32,3072,1536,ck,7,3,6.5001,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.46,763.73,0.0 -256,32,3072,3072,asm,15,8,8.2748,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,72.99,1176.11,0.0006 -256,48,3072,1536,ck,7,0,6.5182,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,69.5,780.47,0.0 -256,48,3072,3072,asm,4,5,9.5269,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,95.1,1037.02,0.0001 -256,64,3072,1536,ck,6,3,6.4157,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,94.14,812.09,0.0 -256,64,3072,3072,asm,3,4,9.2705,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,130.3,1081.6,0.0001 -256,80,3072,1536,ck,7,1,6.2015,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,121.74,859.95,0.0 -256,80,3072,3072,asm,12,5,9.6381,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,156.66,1055.65,0.0001 -256,96,3072,1536,ck,7,1,6.1725,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,146.78,883.9,0.0 -256,96,3072,3072,asm,19,4,9.2652,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,195.56,1114.05,0.0001 -256,112,3072,1536,ck,7,3,6.5959,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,160.25,845.79,0.0 -256,112,3072,3072,ck,7,0,9.9923,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,211.56,1047.74,0.0 -256,128,3072,1536,ck,7,0,6.2613,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,192.92,910.61,0.0 -256,128,3072,3072,ck,7,0,10.3718,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,232.93,1023.62,0.0 -256,144,3072,1536,ck,12,1,6.338,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,214.41,918.98,0.0 -256,144,3072,3072,ck,7,0,10.2283,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,265.72,1052.4,0.0 -256,160,3072,1536,ck,12,0,6.0824,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.25,977.8,0.0 -256,160,3072,3072,ck,12,1,9.8728,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,305.88,1105.23,0.0 -256,176,3072,1536,ck,6,3,7.3433,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,226.19,826.64,0.0 -256,176,3072,3072,ck,12,1,11.0912,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,299.51,997.11,0.0 -256,192,3072,1536,ck,6,0,6.9211,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,261.8,894.82,0.0 -256,192,3072,3072,ck,12,3,10.5231,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,344.37,1064.96,0.0 -256,208,3072,1536,ck,12,1,7.2509,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,270.72,871.07,0.0 -256,208,3072,3072,ck,17,2,11.654,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,336.87,974.27,0.0 -256,224,3072,1536,ck,12,1,7.2036,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,293.45,893.85,0.0 -256,224,3072,3072,ck,12,3,11.5304,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,366.67,997.5,0.0 -256,240,3072,1536,ck,11,1,7.2241,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,313.52,908.32,0.0 -256,240,3072,3072,ck,17,3,11.735,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,386.01,992.67,0.0 -256,256,3072,1536,ck,11,3,7.3204,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,330.03,913.16,0.0 -256,256,3072,3072,ck,17,2,11.4432,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,422.25,1030.87,0.0 -256,288,3072,1536,ck,12,3,7.4073,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,366.92,935.62,0.0 -256,288,3072,3072,ck,12,0,11.9257,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,455.81,1013.89,0.0 -256,320,3072,1536,ck,17,0,7.4931,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,403.02,957.71,0.0 -256,320,3072,3072,ck,17,0,11.7025,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,516.11,1058.43,0.0 -256,352,3072,1536,ck,10,0,9.4059,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,353.17,789.07,0.0212 -256,352,3072,3072,asm,8,1,14.6095,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,454.76,868.01,0.0 -256,384,3072,1536,ck,12,3,9.3019,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,389.58,824.32,0.0 -256,384,3072,3072,asm,8,1,14.6577,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,494.47,885.28,0.0 -256,416,3072,1536,asm,8,1,9.7878,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,401.1,808.5,0.0 -256,416,3072,3072,asm,8,1,14.702,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,534.06,902.67,0.0 -256,448,3072,1536,ck,12,1,9.7709,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,432.7,835.05,0.0 -256,448,3072,3072,asm,8,1,14.7225,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,574.34,921.44,0.0 -256,480,3072,1536,ck,12,3,10.0889,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,448.99,833.09,0.0 -256,480,3072,3072,asm,8,1,14.8006,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,612.12,936.51,0.0 -256,512,3072,1536,ck,12,2,10.3534,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,466.69,835.55,0.0 -256,512,3072,3072,asm,16,1,15.8579,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,609.39,892.66,0.0 -256,544,3072,1536,ck,11,3,10.5019,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,488.85,847.13,0.0 -256,544,3072,3072,asm,16,1,15.8752,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,646.77,910.27,0.0 -256,576,3072,1536,ck,17,1,10.5455,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,515.46,866.94,0.0 -256,576,3072,3072,asm,16,1,15.9596,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,681.2,923.93,0.0 -256,608,3072,1536,ck,11,0,10.6701,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,537.75,879.84,0.0 -256,608,3072,3072,asm,16,1,16.0084,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,716.85,939.54,0.0 -256,640,3072,1536,ck,17,1,10.7885,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,559.84,892.97,0.0 -256,640,3072,3072,asm,16,1,16.2216,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,744.66,945.37,0.0 -256,672,3072,1536,ck,11,3,10.9913,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,576.98,898.85,0.0 -256,672,3072,3072,asm,24,1,17.2482,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,735.36,906.2,0.0 -256,704,3072,1536,asm,24,1,12.1215,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,548.1,835.32,0.0 -256,704,3072,3072,asm,24,1,17.5391,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,757.6,907.99,0.0 -256,736,3072,1536,asm,24,1,11.7277,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,592.25,884.32,0.0 -256,736,3072,3072,asm,24,1,17.5502,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,791.53,924.22,0.0 -256,768,3072,1536,asm,24,1,11.8225,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,613.05,898.02,0.0 -256,768,3072,3072,asm,24,1,17.4237,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,831.94,947.85,0.0 -256,800,3072,1536,asm,24,1,12.0428,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,626.91,902.0,0.0 -256,800,3072,3072,asm,24,1,17.7973,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,848.41,944.52,0.0 -256,832,3072,1536,asm,32,1,13.235,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,593.26,839.32,0.0 -256,832,3072,3072,asm,32,1,19.303,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,813.53,886.13,0.0 -256,864,3072,1536,asm,32,1,13.3818,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,609.31,848.47,0.0 -256,864,3072,3072,asm,32,1,19.4672,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,837.69,893.8,0.0 -256,896,3072,1536,asm,32,1,12.9966,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,650.61,892.53,0.0 -256,896,3072,3072,asm,32,1,19.4031,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,871.58,911.95,0.0 -256,928,3072,1536,asm,32,1,13.4375,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,651.74,881.54,0.0 -256,928,3072,3072,asm,32,1,19.6687,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,890.52,914.63,0.0 -256,960,3072,1536,asm,32,1,13.6336,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,664.51,886.88,0.0 -256,960,3072,3072,asm,32,1,19.8975,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,910.64,918.94,0.0 -256,992,3072,1536,ck,15,1,14.2672,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,656.17,864.72,0.0137 -256,992,3072,3072,ck,14,3,22.8957,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,817.77,811.48,0.0001 -256,1024,3072,1536,ck,15,3,14.2212,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,679.53,884.8,0.0014 -256,1024,3072,3072,ck,14,3,22.9132,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,843.5,823.73,0.0001 -256,1088,3072,1536,ck,14,0,14.3467,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,715.68,911.32,0.0 -256,1088,3072,3072,ck,14,3,22.4901,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,913.08,865.46,0.0001 -256,1152,3072,1536,ck,14,3,14.3806,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,755.99,943.35,0.0 -256,1152,3072,3072,ck,14,0,22.6081,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,961.75,887.03,0.0001 -256,1216,3072,1536,ck,14,3,14.6973,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,780.8,956.47,0.0 -256,1216,3072,3072,ck,14,2,23.0364,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,996.3,896.14,0.0001 -256,1280,3072,1536,ck,14,1,14.8836,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,811.6,977.52,0.0 -256,1280,3072,3072,ck,14,1,23.4477,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1030.34,905.58,0.0001 -256,1344,3072,1536,ck,14,3,15.1183,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,838.96,994.85,0.0 -256,1344,3072,3072,ck,14,0,23.8751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1062.49,914.07,0.0001 -256,1408,3072,1536,ck,14,0,18.3176,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,725.4,847.93,0.0 -256,1408,3072,3072,ck,14,3,29.2608,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,908.22,765.98,0.0001 -256,1472,3072,1536,ck,14,1,18.5424,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,749.18,864.16,0.0001 -256,1472,3072,3072,ck,14,3,29.6273,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,937.75,776.42,0.0001 -256,1536,3072,1536,ck,14,0,18.6045,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,779.14,887.69,0.0001 -256,1536,3072,3072,ck,14,1,29.4698,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,983.75,800.58,0.0002 -256,1600,3072,1536,ck,14,2,18.8,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,803.16,904.61,0.0001 -256,1600,3072,3072,ck,14,3,29.7895,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1013.75,811.79,0.0002 -256,1664,3072,1536,ck,14,0,18.9387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,829.17,923.93,0.0001 -256,1664,3072,3072,ck,14,0,30.0458,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1045.3,824.49,0.0001 -256,1728,3072,1536,ck,14,3,19.2012,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,849.29,936.9,0.0002 -256,1728,3072,3072,ck,14,2,30.4353,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1071.61,833.32,0.0002 -256,1792,3072,1536,ck,14,1,19.3448,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,874.21,955.36,0.0001 -256,1792,3072,3072,ck,14,0,30.8284,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1097.13,841.83,0.0002 -256,1856,3072,1536,ck,14,1,19.4996,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,898.24,972.98,0.0002 -256,1856,3072,3072,ck,14,0,31.8273,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1100.65,833.94,0.0002 -256,1920,3072,1536,ck,14,2,19.5725,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,925.76,994.47,0.0002 -256,1920,3072,3072,ck,14,0,31.7613,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1140.97,854.24,0.0002 -256,1984,3072,1536,ck,14,1,20.1808,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,927.78,988.85,0.0002 -256,1984,3072,3072,ck,14,3,33.9609,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1102.64,816.28,0.0002 -256,2048,3072,1536,ck,14,2,20.3677,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,948.92,1003.9,0.0001 -256,2048,3072,3072,ck,14,3,33.9279,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1139.32,834.46,0.0002 -256,4096,3072,1536,ck,13,0,32.8903,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1175.26,1099.89,0.0 -256,4096,3072,3072,ck,13,1,56.2027,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1375.55,839.57,0.0 -256,8192,3072,1536,ck,13,0,60.7603,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1272.37,1113.11,0.0 -256,8192,3072,3072,ck,13,0,98.9536,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1562.54,858.33,0.0 -256,16384,3072,1536,ck,13,2,113.3639,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1363.92,1151.58,0.0 -256,16384,3072,3072,ck,13,1,188.8317,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1637.64,849.6,0.0 -256,32768,3072,1536,ck,13,0,206.2023,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1499.68,1243.33,0.0 -256,32768,3072,3072,ck,13,1,358.9075,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1723.22,867.71,0.0 -256,16,4096,512,ck,7,3,3.0553,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.96,731.98,0.0 -256,16,4096,3072,asm,5,6,8.5265,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,47.22,1496.88,0.0001 -256,32,4096,512,ck,7,1,3.352,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.04,708.74,0.0 -256,32,4096,3072,asm,5,6,9.3323,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,86.29,1386.94,0.0001 -256,48,4096,512,ck,7,2,3.4163,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,58.93,736.16,0.0 -256,48,4096,3072,asm,13,6,9.3261,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,129.52,1407.19,0.0002 -256,64,4096,512,ck,7,0,3.5282,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,76.08,752.28,0.0 -256,64,4096,3072,ck,12,0,9.8624,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,163.31,1348.94,0.0 -256,80,4096,512,ck,7,1,3.9461,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,85.03,707.91,0.0 -256,80,4096,3072,asm,11,4,10.0921,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,199.49,1336.1,0.0001 -256,96,4096,512,ck,7,1,3.9483,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,101.98,742.78,0.0 -256,96,4096,3072,ck,12,0,9.741,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.02,1402.76,0.0 -256,112,4096,512,ck,6,1,4.0039,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,117.33,767.25,0.0 -256,112,4096,3072,ck,7,0,10.2261,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,275.63,1353.84,0.0 -256,128,4096,512,ck,7,0,3.933,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,136.5,816.49,0.0 -256,128,4096,3072,ck,12,1,9.764,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,329.91,1436.37,0.0 -256,144,4096,512,ck,7,1,4.6145,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,130.89,726.09,0.0 -256,144,4096,3072,ck,17,0,11.6095,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,312.15,1223.56,0.0 -256,160,4096,512,ck,7,0,4.7158,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,142.31,740.02,0.0 -256,160,4096,3072,ck,17,0,11.648,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,345.68,1234.99,0.0 -256,176,4096,512,ck,7,1,4.6429,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,158.99,781.64,0.0 -256,176,4096,3072,ck,17,1,11.723,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,377.82,1242.46,0.0 -256,192,4096,512,ck,7,0,4.6761,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,172.22,805.87,0.0 -256,192,4096,3072,ck,17,0,11.4303,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,422.72,1290.04,0.0 -256,208,4096,512,ck,11,2,4.762,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,183.2,820.58,0.0 -256,208,4096,3072,ck,17,2,11.7962,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,443.74,1265.31,0.0 -256,224,4096,512,ck,11,2,4.8073,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,195.44,841.81,0.0 -256,224,4096,3072,ck,17,3,11.8189,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,476.96,1278.13,0.0 -256,240,4096,512,ck,11,1,4.7521,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,211.83,880.9,0.0 -256,240,4096,3072,ck,17,1,11.8801,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,508.4,1286.71,0.0 -256,256,4096,512,ck,11,2,4.7337,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,226.83,913.74,0.0 -256,256,4096,3072,ck,17,2,11.5786,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,556.41,1335.78,0.0 -256,288,4096,512,ck,12,2,5.7243,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,211.02,804.27,0.0 -256,288,4096,3072,asm,8,1,14.4923,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,500.11,1092.09,0.0 -256,320,4096,512,ck,12,3,5.7755,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,232.39,845.37,0.0 -256,320,4096,3072,asm,8,1,14.6266,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,550.58,1106.71,0.0 -256,352,4096,512,ck,12,0,5.8203,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,253.66,886.72,0.0 -256,352,4096,3072,asm,8,1,14.722,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,601.71,1124.02,0.0 -256,384,4096,512,ck,12,0,5.8685,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,274.45,926.9,0.0 -256,384,4096,3072,asm,8,1,14.8001,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,652.95,1142.44,0.0 -256,416,4096,512,ck,7,0,6.0603,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,287.91,943.52,0.0 -256,416,4096,3072,asm,16,1,15.6767,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,667.81,1101.55,0.0 -256,448,4096,512,ck,15,0,6.0223,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,312.02,995.72,0.0001 -256,448,4096,3072,asm,16,1,15.7201,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,717.19,1121.44,0.0 -256,480,4096,512,ck,15,1,6.0166,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,334.62,1042.96,0.0122 -256,480,4096,3072,asm,16,1,15.7477,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,767.07,1142.37,0.0 -256,512,4096,512,ck,9,2,6.0597,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,354.39,1081.51,0.0 -256,512,4096,3072,asm,16,1,15.937,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,808.49,1151.41,0.0 -256,544,4096,512,ck,11,2,6.3309,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,360.41,1079.17,0.0 -256,544,4096,3072,asm,24,1,17.1008,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,800.56,1094.13,0.0 -256,576,4096,512,ck,9,3,6.3106,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,382.84,1126.78,0.0 -256,576,4096,3072,asm,24,1,17.4405,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,831.14,1093.49,0.0 -256,608,4096,512,ck,9,3,6.3107,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,404.1,1170.9,0.0 -256,608,4096,3072,asm,24,1,17.2655,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,886.21,1125.45,0.0 -256,640,4096,512,ck,12,1,6.5354,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,410.74,1173.26,0.0 -256,640,4096,3072,asm,24,1,17.8108,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,904.29,1111.23,0.0 -256,672,4096,512,ck,9,2,6.5833,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,428.14,1207.03,0.0 -256,672,4096,3072,asm,32,1,19.4713,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,868.53,1034.98,0.0 -256,704,4096,512,ck,11,2,6.711,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,439.99,1225.57,0.0 -256,704,4096,3072,asm,32,1,19.1316,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,926.05,1072.19,0.0 -256,736,4096,512,ck,11,2,7.0987,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,434.87,1197.87,0.0 -256,736,4096,3072,asm,32,1,19.2547,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,961.95,1084.06,0.0 -256,768,4096,512,ck,9,0,6.8799,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,468.21,1276.45,0.0 -256,768,4096,3072,asm,32,1,19.3192,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1000.42,1099.1,0.0 -256,800,4096,512,ck,9,3,8.1419,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,412.12,1112.81,0.0 -256,800,4096,3072,asm,40,1,23.1091,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,871.2,934.44,0.0 -256,832,4096,512,ck,16,0,8.2291,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,424.06,1134.86,0.0 -256,832,4096,3072,ck,14,2,22.6787,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,923.24,968.07,0.0001 -256,864,4096,512,ck,14,2,8.1323,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,445.62,1182.62,0.0 -256,864,4096,3072,ck,14,3,23.4387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,927.67,952.06,0.0001 -256,896,4096,512,ck,12,1,8.2695,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,454.45,1196.68,0.0 -256,896,4096,3072,ck,14,1,23.3659,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,965.02,970.45,0.0001 -256,928,4096,512,ck,16,2,8.5579,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,454.82,1188.9,0.0 -256,928,4096,3072,asm,40,1,23.5093,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,993.39,979.86,0.0 -256,960,4096,512,ck,16,1,8.4661,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,475.61,1234.69,0.0 -256,960,4096,3072,ck,14,3,23.7112,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1018.89,986.72,0.0001 -256,992,4096,512,ck,16,2,8.7551,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,475.24,1225.74,0.0 -256,992,4096,3072,asm,40,1,23.5538,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1059.89,1008.62,0.0 -256,1024,4096,512,ck,16,3,8.7835,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,488.98,1253.49,0.0 -256,1024,4096,3072,asm,40,1,23.8252,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1081.62,1012.26,0.0 -256,1088,4096,512,ck,11,3,10.1949,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,447.62,1134.6,0.0 -256,1088,4096,3072,ck,14,1,29.8181,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,918.25,832.99,0.0002 -256,1152,4096,512,ck,11,1,10.1493,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,476.08,1194.58,0.0 -256,1152,4096,3072,ck,14,2,29.8302,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,971.87,856.82,0.0002 -256,1216,4096,512,ck,9,0,10.3706,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,491.8,1222.8,0.0 -256,1216,4096,3072,ck,14,2,29.9089,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1023.16,878.67,0.0001 -256,1280,4096,512,ck,11,1,10.1941,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,526.65,1298.62,0.0 -256,1280,4096,3072,ck,14,0,30.2437,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1065.09,892.78,0.0002 -256,1344,4096,512,ck,11,3,10.9237,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,516.05,1262.88,0.0 -256,1344,4096,3072,ck,14,0,31.1831,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1084.65,889.0,0.0001 -256,1408,4096,512,ck,14,3,11.0822,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,532.89,1295.08,0.0001 -256,1408,4096,3072,ck,14,1,31.7025,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1117.69,897.17,0.0002 -256,1472,4096,512,ck,14,1,11.2016,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,551.17,1331.01,0.0001 -256,1472,4096,3072,ck,14,1,31.8961,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1161.4,914.33,0.0002 -256,1536,4096,512,ck,14,3,11.3212,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,569.06,1366.15,0.0002 -256,1536,4096,3072,ck,13,1,32.3752,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1193.96,923.07,0.0 -256,1600,4096,512,ck,11,1,12.1644,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,551.68,1317.25,0.0 -256,1600,4096,3072,ck,13,3,33.1053,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1216.28,924.48,0.0 -256,1664,4096,512,ck,11,1,12.2937,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,567.72,1348.71,0.0 -256,1664,4096,3072,ck,13,3,33.6312,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1245.15,931.46,0.0 -256,1728,4096,512,ck,11,1,12.4573,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,581.81,1375.71,0.0 -256,1728,4096,3072,ck,13,3,33.2495,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1307.89,963.84,0.0 -256,1792,4096,512,ck,11,3,12.5727,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,597.82,1407.39,0.0 -256,1792,4096,3072,ck,13,3,33.6704,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1339.37,973.2,0.0 -256,1856,4096,512,ck,11,0,13.3033,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,585.17,1371.97,0.0 -256,1856,4096,3072,ck,13,2,35.6071,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1311.75,940.51,0.0 -256,1920,4096,512,ck,11,3,13.6155,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,591.46,1381.43,0.0 -256,1920,4096,3072,ck,13,3,37.0853,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1302.9,922.46,0.0 -256,1984,4096,512,ck,16,0,13.8619,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,600.31,1397.06,0.0 -256,1984,4096,3072,ck,13,3,38.9825,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1280.81,896.06,0.0 -256,2048,4096,512,ck,9,2,14.4383,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,594.94,1379.87,0.0 -256,2048,4096,3072,ck,13,1,39.4437,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1306.66,903.86,0.0 -256,4096,4096,512,ck,16,3,23.585,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,728.42,1600.54,0.0 -256,4096,4096,3072,ck,13,2,71.402,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1443.65,822.39,0.0 -256,8192,4096,512,ck,16,1,41.9105,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,819.84,1751.36,0.0 -256,8192,4096,3072,ck,13,2,132.4593,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1556.39,791.62,0.0 -256,16384,4096,512,ck,16,2,78.2029,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,878.73,1850.36,0.0 -256,16384,4096,3072,ck,13,3,248.0474,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1662.25,794.74,0.0 -256,32768,4096,512,ck,16,2,143.2204,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,959.63,2006.07,0.0 -256,32768,4096,3072,ck,13,3,481.3425,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1713.2,792.95,0.0 -256,16,4608,7168,asm,22,7,10.2718,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,102.9,3241.13,0.0059 -256,32,4608,7168,asm,29,6,11.6205,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,181.91,2887.52,0.004 -256,48,4608,7168,asm,13,6,13.7652,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,230.36,2456.67,0.0037 -256,64,4608,7168,asm,29,6,12.857,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,328.84,2650.6,0.0036 -256,80,4608,7168,asm,37,6,14.7725,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,357.75,2324.65,0.0047 -256,96,4608,7168,asm,37,6,15.4561,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,410.31,2238.79,0.0039 -256,112,4608,7168,asm,18,3,17.8925,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,413.51,1948.59,0.0004 -256,128,4608,7168,asm,18,3,18.0058,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,469.61,1950.89,0.0004 -256,144,4608,7168,asm,26,3,20.2285,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,470.26,1749.48,0.0004 -256,160,4608,7168,asm,26,3,20.8025,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,508.09,1713.81,0.0004 -256,176,4608,7168,asm,34,3,21.847,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,532.18,1643.87,0.0004 -256,192,4608,7168,ck,17,3,21.9076,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,578.96,1651.29,0.0 -256,208,4608,7168,asm,42,3,23.3806,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,587.69,1558.47,0.0004 -256,224,4608,7168,ck,11,3,23.044,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,642.14,1592.61,0.0 -256,240,4608,7168,asm,25,2,25.8285,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,613.84,1431.07,0.0 -256,256,4608,7168,asm,33,2,26.9625,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,627.22,1380.6,0.0 -256,288,4608,7168,asm,33,2,27.8307,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,683.61,1356.37,0.0 -256,320,4608,7168,asm,8,1,29.3886,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,719.3,1302.31,0.0 -256,352,4608,7168,asm,41,2,30.6762,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,758.02,1264.74,0.0 -256,384,4608,7168,asm,16,1,30.5188,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,831.2,1288.44,0.0 -256,416,4608,7168,asm,16,1,30.6272,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,897.28,1301.0,0.0 -256,448,4608,7168,asm,16,1,31.1697,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,949.48,1295.17,0.0 -256,480,4608,7168,asm,24,1,33.299,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,952.25,1228.1,0.0 -256,512,4608,7168,asm,24,1,33.577,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,1007.32,1233.55,0.0 -256,544,4608,7168,asm,24,1,33.9623,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,1058.14,1234.99,0.0 -256,576,4608,7168,asm,32,1,36.4667,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1043.44,1164.55,0.0 -256,608,4608,7168,asm,32,1,39.2737,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1022.69,1094.67,0.0 -256,640,4608,7168,asm,32,1,37.7105,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1121.14,1153.95,0.0 -256,672,4608,7168,asm,32,1,38.6714,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1147.94,1138.83,0.0 -256,704,4608,7168,asm,40,1,45.5781,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1020.37,977.76,0.0 -256,736,4608,7168,asm,40,1,47.3086,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1027.73,953.08,0.0 -256,768,4608,7168,asm,40,1,46.0129,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1102.61,991.31,0.0 -256,800,4608,7168,asm,40,1,47.395,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1115.06,973.46,0.0 -256,832,4608,7168,ck,13,1,48.1976,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1140.35,968.13,0.0 -256,864,4608,7168,asm,40,1,49.4742,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1153.65,953.75,0.0 -256,896,4608,7168,ck,13,0,49.2493,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1201.84,968.75,0.0 -256,928,4608,7168,ck,14,2,61.0416,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1004.3,790.19,0.0001 -256,960,4608,7168,ck,13,2,60.7415,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1044.06,802.73,0.0 -256,992,4608,7168,ck,14,2,64.9887,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1008.36,758.33,0.0001 -256,1024,4608,7168,ck,14,3,64.9234,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1041.93,767.17,0.0001 -256,1088,4608,7168,ck,14,0,62.6762,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1146.74,811.41,0.0002 -256,1152,4608,7168,ck,13,2,63.8589,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1191.71,812.8,0.0 -256,1216,4608,7168,ck,14,3,64.0235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1254.68,827.09,0.0001 -256,1280,4608,7168,ck,13,3,65.4649,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1291.64,824.89,0.0 -256,1344,4608,7168,ck,14,2,68.5828,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1294.57,802.68,0.0002 -256,1408,4608,7168,ck,13,1,65.8692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1412.08,851.67,0.0 -256,1472,4608,7168,ck,13,3,69.1204,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1406.83,826.78,0.0 -256,1536,4608,7168,ck,13,3,73.8412,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1374.15,788.12,0.0 -256,1600,4608,7168,ck,13,2,74.9489,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1410.25,790.47,0.0 -256,1664,4608,7168,ck,13,0,76.3776,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1439.22,789.41,0.0 -256,1728,4608,7168,ck,13,1,77.1936,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1478.78,794.65,0.0 -256,1792,4608,7168,ck,13,0,78.3613,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1510.7,796.19,0.0 -256,1856,4608,7168,ck,14,1,99.0642,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1237.66,640.38,0.0002 -256,1920,4608,7168,ck,14,2,99.9722,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1268.71,645.05,0.0001 -256,1984,4608,7168,ck,13,3,101.9007,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1286.19,643.14,0.0 -256,2048,4608,7168,ck,14,2,105.2531,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1285.39,632.61,0.0002 -256,4096,4608,7168,ck,13,2,171.4096,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1578.58,584.21,0.0 -256,8192,4608,7168,ck,13,0,305.5678,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1771.02,547.33,0.0 -256,16384,4608,7168,ck,13,3,574.1814,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1885.0,525.04,0.0 -256,32768,4608,7168,ck,13,2,1114.2846,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1942.65,511.45,0.0 -256,16,7168,2048,ck,7,3,7.2132,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,65.13,2071.51,0.0 -256,16,7168,2304,ck,7,0,8.067,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,65.51,2080.24,0.0 -256,32,7168,2048,ck,7,0,7.222,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,130.09,2105.28,0.0 -256,32,7168,2304,ck,7,3,8.253,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,128.07,2065.62,0.0 -256,48,7168,2048,ck,6,3,7.9865,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,176.46,1936.58,0.0 -256,48,7168,2304,ck,7,0,9.4657,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,167.49,1829.11,0.0 -256,64,7168,2048,ck,6,1,8.335,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,225.44,1887.06,0.0 -256,64,7168,2304,ck,12,0,9.5781,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,220.7,1835.44,0.0 -256,80,7168,2048,ck,11,1,9.4105,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,249.59,1699.25,0.0 -256,80,7168,2304,ck,17,2,9.9321,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.05,1796.83,0.0 -256,96,7168,2048,ck,11,2,9.625,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,292.84,1688.62,0.0 -256,96,7168,2304,ck,17,1,9.9563,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,318.48,1819.2,0.0 -256,112,7168,2048,ck,11,1,9.4953,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,346.31,1739.29,0.0 -256,112,7168,2304,ck,17,0,10.0093,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,369.59,1836.17,0.0 -256,128,7168,2048,ck,11,0,9.471,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,396.8,1771.43,0.0 -256,128,7168,2304,ck,17,3,10.0901,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,419.01,1847.85,0.0 -256,144,7168,2048,asm,8,1,11.1728,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,378.41,1525.08,0.0 -256,144,7168,2304,asm,8,1,12.1844,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,390.36,1552.09,0.0 -256,160,7168,2048,asm,8,1,11.2214,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,418.63,1541.83,0.0 -256,160,7168,2304,asm,8,1,12.1653,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,434.42,1576.41,0.0 -256,176,7168,2048,asm,8,1,11.5129,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,448.83,1525.56,0.0 -256,176,7168,2304,asm,8,1,12.5851,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,461.92,1544.98,0.0 -256,192,7168,2048,asm,8,1,11.6453,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,484.07,1530.73,0.0 -256,192,7168,2304,asm,8,1,12.6184,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,502.58,1562.0,0.0 -256,208,7168,2048,asm,16,1,12.0625,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,506.27,1499.52,0.0 -256,208,7168,2304,asm,16,1,13.1559,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,522.22,1518.42,0.0 -256,224,7168,2048,ck,17,0,12.3478,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,532.62,1486.1,0.0 -256,224,7168,2304,asm,16,1,13.3633,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,553.66,1514.78,0.0 -256,240,7168,2048,asm,16,1,12.4016,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,568.19,1500.79,0.0 -256,240,7168,2304,asm,16,1,13.2042,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,600.36,1553.19,0.0 -256,256,7168,2048,asm,16,1,12.2335,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,614.39,1542.84,0.0 -256,256,7168,2304,asm,16,1,13.6382,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,620.0,1523.29,0.0 -256,288,7168,2048,asm,24,1,13.7055,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,616.96,1415.39,0.0 -256,288,7168,2304,asm,24,1,14.4966,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,656.2,1469.82,0.0 -256,320,7168,2048,asm,24,1,13.4623,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,697.89,1479.91,0.0 -256,320,7168,2304,asm,24,1,14.5582,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,726.03,1500.18,0.0 -256,352,7168,2048,asm,32,1,14.96,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,690.83,1366.79,0.0 -256,352,7168,2304,asm,32,1,15.663,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,742.3,1428.36,0.0 -256,384,7168,2048,asm,32,1,14.6886,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,767.55,1427.74,0.0 -256,384,7168,2304,asm,32,1,16.4306,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,771.95,1394.04,0.0 -256,416,7168,2048,ck,14,3,16.8019,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,726.93,1279.37,0.0001 -256,416,7168,2304,ck,14,0,18.9316,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,725.8,1238.0,0.0 -256,448,7168,2048,ck,14,0,16.6168,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,791.57,1325.17,0.0001 -256,448,7168,2304,ck,14,2,18.9489,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,780.92,1264.97,0.0 -256,480,7168,2048,ck,14,2,17.1416,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,822.14,1315.19,0.0001 -256,480,7168,2304,asm,40,1,19.1807,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,826.58,1277.44,0.0 -256,512,7168,2048,ck,14,0,17.088,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,879.7,1349.99,0.0002 -256,512,7168,2304,ck,14,0,19.4496,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,869.5,1287.16,0.0 -256,544,7168,2048,ck,14,1,17.8708,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,893.74,1320.2,0.0001 -256,544,7168,2304,ck,14,1,20.1182,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,893.14,1270.85,0.0 -256,576,7168,2048,ck,14,3,17.8731,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,946.19,1349.36,0.0002 -256,576,7168,2304,ck,14,2,20.0657,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,948.15,1300.71,0.0 -256,608,7168,2048,ck,14,0,21.9319,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,813.93,1123.55,0.0001 -256,608,7168,2304,ck,14,0,24.7891,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,810.13,1074.35,0.0001 -256,640,7168,2048,ck,14,3,21.8952,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,858.2,1149.38,0.0002 -256,640,7168,2304,ck,14,1,24.582,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,859.95,1105.06,0.0001 -256,672,7168,2048,ck,14,0,22.3856,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,881.37,1147.62,0.0001 -256,672,7168,2304,ck,14,0,25.3952,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,874.03,1090.65,0.0001 -256,704,7168,2048,ck,14,0,22.1673,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,932.43,1182.57,0.0001 -256,704,7168,2304,ck,14,2,24.9362,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,932.51,1132.07,0.0001 -256,736,7168,2048,ck,14,3,22.8087,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,947.4,1172.3,0.0002 -256,736,7168,2304,ck,14,3,25.8221,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,941.45,1113.86,0.0001 -256,768,7168,2048,ck,14,1,22.8406,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,987.21,1193.62,0.0002 -256,768,7168,2304,ck,14,0,25.4961,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,994.94,1148.98,0.0002 -256,800,7168,2048,ck,14,0,23.6361,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,993.74,1175.63,0.0002 -256,800,7168,2304,ck,14,1,26.6837,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,990.27,1117.8,0.0001 -256,832,7168,2048,ck,14,0,23.2242,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1051.82,1219.05,0.0003 -256,832,7168,2304,ck,14,1,26.6718,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1030.34,1138.26,0.0002 -256,864,7168,2048,ck,13,0,25.6049,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,990.71,1126.18,0.0 -256,864,7168,2304,ck,13,3,28.3077,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1008.14,1091.29,0.0 -256,896,7168,2048,ck,13,0,25.6653,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1024.99,1143.96,0.0 -256,896,7168,2304,ck,13,0,27.9433,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1059.11,1124.58,0.0 -256,928,7168,2048,ck,13,3,26.5242,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1027.22,1126.68,0.0 -256,928,7168,2304,ck,13,0,29.6559,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1033.59,1077.59,0.0 -256,960,7168,2048,ck,13,1,26.7867,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1052.23,1135.22,0.0 -256,960,7168,2304,ck,13,1,29.2621,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1083.62,1110.29,0.0 -256,992,7168,2048,ck,13,2,27.1511,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1072.71,1139.29,0.0 -256,992,7168,2304,ck,13,0,30.3926,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1078.09,1086.51,0.0 -256,1024,7168,2048,ck,13,2,26.9597,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1115.17,1166.83,0.0 -256,1024,7168,2304,ck,13,2,30.2979,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1116.34,1107.48,0.0 -256,1088,7168,2048,ck,13,2,27.2805,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1170.94,1191.54,0.0 -256,1088,7168,2304,ck,13,3,30.6045,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1174.23,1131.19,0.0 -256,1152,7168,2048,ck,13,1,27.4994,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1229.95,1220.19,0.0 -256,1152,7168,2304,ck,13,1,30.6421,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1241.78,1164.55,0.0 -256,1216,7168,2048,ck,14,1,33.3229,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1071.39,1038.42,0.0003 -256,1216,7168,2304,ck,14,1,38.9553,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1031.04,943.37,0.0002 -256,1280,7168,2048,ck,14,1,33.8453,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1110.37,1053.37,0.0003 -256,1280,7168,2304,ck,14,1,38.5149,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1097.72,981.81,0.0003 -256,1344,7168,2048,ck,14,1,35.0697,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1125.19,1046.49,0.0003 -256,1344,7168,2304,ck,14,1,39.8326,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1114.48,976.07,0.0002 -256,1408,7168,2048,ck,14,1,34.9055,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1184.31,1081.46,0.0003 -256,1408,7168,2304,ck,14,0,40.0446,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1161.37,997.49,0.0002 -256,1472,7168,2048,ck,13,2,37.8797,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1140.93,1024.22,0.0 -256,1472,7168,2304,ck,13,0,41.5178,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1171.07,987.75,0.0 -256,1536,7168,2048,ck,13,1,37.7413,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1194.9,1055.76,0.0 -256,1536,7168,2304,ck,13,3,41.4718,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1223.34,1014.52,0.0 -256,1600,7168,2048,ck,13,0,38.8337,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1209.68,1053.07,0.0 -256,1600,7168,2304,ck,13,2,42.9042,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1231.77,1005.47,0.0 -256,1664,7168,2048,ck,13,1,39.2573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1244.49,1068.41,0.0 -256,1664,7168,2304,ck,13,2,43.621,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1259.99,1013.37,0.0 -256,1728,7168,2048,ck,13,0,39.8069,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1274.51,1080.0,0.0 -256,1728,7168,2304,ck,13,3,44.4322,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1284.57,1018.83,0.0 -256,1792,7168,2048,ck,14,2,45.809,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1148.54,961.39,0.0003 -256,1792,7168,2304,ck,13,2,49.8498,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1187.37,929.47,0.0 -256,1856,7168,2048,ck,14,2,46.3162,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1176.53,973.5,0.0003 -256,1856,7168,2304,ck,14,1,52.4771,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1168.2,903.23,0.0003 -256,1920,7168,2048,ck,13,2,46.9258,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1201.29,983.2,0.0 -256,1920,7168,2304,ck,13,1,52.4563,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1208.97,923.89,0.0 -256,1984,7168,2048,ck,13,0,48.5366,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1200.14,972.17,0.0 -256,1984,7168,2304,ck,13,3,54.0152,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1213.21,916.94,0.0 -256,2048,7168,2048,ck,13,1,49.7429,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1208.81,969.68,0.0 -256,2048,7168,2304,ck,13,1,54.9991,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1229.94,919.9,0.0 -256,4096,7168,2048,ck,13,1,84.9609,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1415.46,962.67,0.0 -256,4096,7168,2304,ck,13,0,94.7138,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1428.42,893.98,0.0 -256,8192,7168,2048,ck,13,0,157.5272,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1526.84,945.22,0.0 -256,8192,7168,2304,ck,13,3,171.0665,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1581.74,893.39,0.0 -256,16384,7168,2048,ck,13,1,291.9559,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1647.63,969.72,0.0 -256,16384,7168,2304,ck,13,1,324.2426,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1669.02,891.75,0.0 -256,32768,7168,2048,ck,13,1,556.7794,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1727.92,990.61,0.0 -256,32768,7168,2304,ck,13,0,619.9048,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1745.96,906.23,0.0 -256,16,8192,1536,ck,7,2,6.4719,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.22,1988.54,0.0 -256,32,8192,1536,ck,7,2,6.2694,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,128.45,2098.5,0.0 -256,48,8192,1536,ck,6,0,6.7969,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,177.72,1977.82,0.0 -256,64,8192,1536,ck,12,1,6.8746,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.28,1997.18,0.0 -256,80,8192,1536,ck,12,0,8.1128,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.16,1727.7,0.0 -256,96,8192,1536,ck,11,3,7.6815,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,314.51,1862.04,0.0 -256,112,8192,1536,ck,12,1,8.0886,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,348.46,1803.77,0.0 -256,128,8192,1536,ck,11,0,8.3009,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,388.06,1792.18,0.0 -256,144,8192,1536,asm,16,1,10.0466,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,360.71,1509.31,0.0 -256,160,8192,1536,asm,8,1,9.5079,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,423.49,1624.98,0.0 -256,176,8192,1536,asm,8,1,10.041,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,441.11,1567.26,0.0 -256,192,8192,1536,asm,8,1,9.735,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,496.34,1645.97,0.0 -256,208,8192,1536,ck,17,1,10.774,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,485.84,1513.85,0.0 -256,224,8192,1536,asm,16,1,10.472,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,538.31,1584.89,0.0 -256,240,8192,1536,ck,17,1,11.016,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,548.28,1532.65,0.0 -256,256,8192,1536,asm,16,1,10.6076,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,607.34,1618.69,0.0 -256,288,8192,1536,asm,24,1,11.5847,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,625.63,1531.66,0.0 -256,320,8192,1536,asm,24,1,11.8865,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,677.5,1541.02,0.0 -256,352,8192,1536,asm,32,1,13.3519,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,663.45,1414.84,0.0 -256,384,8192,1536,asm,32,1,13.0874,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,738.4,1487.25,0.0 -256,416,8192,1536,ck,14,0,15.0583,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,695.23,1330.67,0.0 -256,448,8192,1536,ck,14,2,14.9824,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,752.5,1375.69,0.0 -256,480,8192,1536,ck,14,3,15.2947,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,789.79,1385.09,0.0 -256,512,8192,1536,ck,14,3,15.175,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,849.09,1433.8,0.0 -256,544,8192,1536,ck,14,0,19.3452,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,707.68,1154.36,0.0001 -256,576,8192,1536,ck,14,3,19.3504,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,749.11,1183.69,0.0001 -256,608,8192,1536,ck,14,1,19.5196,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,783.87,1202.8,0.0001 -256,640,8192,1536,ck,14,0,19.295,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,834.73,1246.53,0.0001 -256,672,8192,1536,ck,14,2,19.9677,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,846.94,1233.25,0.0002 -256,704,8192,1536,ck,14,0,19.791,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,895.19,1273.23,0.0001 -256,736,8192,1536,ck,14,2,20.4392,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,906.2,1260.91,0.0002 -256,768,8192,1536,ck,14,0,20.2904,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,952.54,1298.42,0.0002 -256,800,8192,1536,ck,13,2,22.2979,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,902.89,1207.24,0.0 -256,832,8192,1536,ck,13,0,22.2371,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,941.58,1236.33,0.0 -256,864,8192,1536,ck,13,3,23.0882,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,941.75,1215.59,0.0 -256,896,8192,1536,ck,13,3,22.8541,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,986.63,1253.13,0.0 -256,928,8192,1536,ck,13,0,23.3637,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,999.58,1250.34,0.0 -256,960,8192,1536,ck,13,0,23.2165,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1040.6,1282.97,0.0 -256,992,8192,1536,ck,13,3,24.0286,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1038.95,1263.48,0.0 -256,1024,8192,1536,ck,13,0,23.849,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1080.54,1297.04,0.0 -256,1088,8192,1536,ck,14,1,28.527,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,959.81,1124.54,0.0001 -256,1152,8192,1536,ck,14,2,28.3804,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1021.52,1170.76,0.0002 -256,1216,8192,1536,ck,14,2,28.9229,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1058.04,1188.46,0.0002 -256,1280,8192,1536,ck,14,1,30.0017,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1073.68,1183.95,0.0002 -256,1344,8192,1536,ck,13,2,31.5777,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1071.1,1161.18,0.0 -256,1408,8192,1536,ck,13,0,31.753,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1115.91,1190.89,0.0 -256,1472,8192,1536,ck,13,1,32.3237,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1146.04,1205.34,0.0 -256,1536,8192,1536,ck,13,3,32.6491,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1183.94,1228.46,0.0 -256,1600,8192,1536,ck,14,3,38.1646,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1055.04,1080.97,0.0002 -256,1664,8192,1536,ck,14,3,38.4425,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1089.31,1102.99,0.0002 -256,1728,8192,1536,ck,14,3,38.6445,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1125.3,1126.9,0.0002 -256,1792,8192,1536,ck,14,1,39.7512,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1134.49,1124.38,0.0002 -256,1856,8192,1536,ck,13,2,41.175,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1134.37,1113.36,0.0 -256,1920,8192,1536,ck,13,2,41.7573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1157.12,1125.3,0.0 -256,1984,8192,1536,ck,13,2,42.9929,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1161.33,1119.63,0.0 -256,2048,8192,1536,ck,13,3,44.712,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1152.7,1102.23,0.0 -256,4096,8192,1536,ck,13,1,79.2213,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1301.16,1085.35,0.0 -256,8192,8192,1536,ck,13,2,146.3103,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1409.05,1089.35,0.0 -256,16384,8192,1536,ck,13,3,271.6202,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1517.99,1127.25,0.0 -256,32768,8192,1536,ck,13,1,513.6065,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1605.57,1167.79,0.0 -256,16,11264,1536,ck,7,1,6.7619,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,81.88,2615.62,0.0 -256,32,11264,1536,ck,6,1,7.096,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,156.05,2546.72,0.0 -256,48,11264,1536,ck,7,0,7.5363,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,220.39,2449.02,0.0 -256,64,11264,1536,ck,17,2,8.3027,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.73,2269.33,0.0 -256,80,11264,1536,asm,8,1,9.794,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,282.65,1963.1,0.0 -256,96,11264,1536,asm,8,1,9.9361,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,334.33,1973.78,0.0 -256,112,11264,1536,ck,12,3,10.4055,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,372.45,1921.74,0.0 -256,128,11264,1536,ck,10,2,10.6475,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,415.98,1914.22,0.0227 -256,144,11264,1536,ck,9,2,11.863,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,420.03,1750.55,0.0 -256,160,11264,1536,ck,9,1,11.9836,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,462.0,1765.06,0.0 -256,176,11264,1536,ck,15,2,12.9368,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,470.76,1664.77,0.0091 -256,192,11264,1536,asm,32,1,12.7768,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,519.99,1715.75,0.0 -256,208,11264,1536,ck,15,3,13.6827,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,526.02,1630.29,0.0112 -256,224,11264,1536,ck,15,1,13.86,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,559.24,1637.22,0.0122 -256,240,11264,1536,ck,15,1,14.1955,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,585.02,1625.65,0.0107 -256,256,11264,1536,ck,15,3,14.3361,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,617.91,1636.56,0.0013 -256,288,11264,1536,ck,14,3,15.2132,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,655.07,1592.82,0.0 -256,320,11264,1536,ck,14,0,15.3302,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,722.3,1630.9,0.0 -256,352,11264,1536,ck,8,2,17.669,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,689.36,1458.6,0.0 -256,384,11264,1536,ck,14,2,18.7495,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,708.69,1415.62,0.0 -256,416,11264,1536,ck,14,0,19.4889,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,738.62,1401.42,0.0001 -256,448,11264,1536,ck,14,2,19.4734,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,796.07,1442.08,0.0001 -256,480,11264,1536,ck,14,1,19.9547,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,832.36,1445.89,0.0001 -256,512,11264,1536,ck,14,1,19.8498,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,892.54,1492.32,0.0001 -256,544,11264,1536,ck,13,0,22.6047,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,832.75,1344.51,0.0 -256,576,11264,1536,ck,13,2,22.3751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,890.78,1392.73,0.0 -256,608,11264,1536,ck,13,0,23.0538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,912.59,1385.13,0.0 -256,640,11264,1536,ck,13,2,23.2775,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,951.39,1404.9,0.0 -256,672,11264,1536,ck,13,2,23.5796,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,986.16,1419.55,0.0 -256,704,11264,1536,ck,13,1,23.4414,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1039.21,1460.77,0.0 -256,736,11264,1536,ck,14,3,28.0696,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,907.31,1247.35,0.0002 -256,768,11264,1536,ck,14,2,27.9392,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,951.18,1280.73,0.0002 -256,800,11264,1536,ck,14,0,28.7283,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,963.59,1272.36,0.0002 -256,832,11264,1536,ck,14,3,29.2751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,983.42,1274.9,0.0002 -256,864,11264,1536,ck,14,0,29.5972,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1010.13,1287.04,0.0002 -256,896,11264,1536,ck,14,1,29.5197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1050.29,1316.51,0.0002 -256,928,11264,1536,ck,13,1,31.8538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1008.09,1244.21,0.0 -256,960,11264,1536,ck,13,3,31.3254,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1060.45,1289.78,0.0 -256,992,11264,1536,ck,13,0,35.4472,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,968.38,1161.53,0.0 -256,1024,11264,1536,ck,13,0,35.0248,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1011.67,1197.52,0.0 -256,1088,11264,1536,ck,13,0,33.7656,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1114.98,1287.79,0.0 -256,1152,11264,1536,ck,14,2,36.3374,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1097.01,1239.03,0.0002 -256,1216,11264,1536,ck,14,0,38.9925,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1079.11,1194.16,0.0003 -256,1280,11264,1536,ck,13,1,39.6937,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1115.84,1211.87,0.0 -256,1344,11264,1536,ck,13,1,42.2937,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1099.61,1173.78,0.0 -256,1408,11264,1536,ck,13,3,41.3354,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1178.68,1238.25,0.0 -256,1472,11264,1536,ck,13,3,41.1898,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1236.61,1280.02,0.0 -256,1536,11264,1536,ck,13,3,46.8067,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1135.53,1159.32,0.0 -256,1600,11264,1536,ck,13,2,48.4414,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1142.92,1151.99,0.0 -256,1664,11264,1536,ck,13,2,48.8634,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1178.37,1173.56,0.0 -256,1728,11264,1536,ck,13,3,48.7658,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1226.15,1207.49,0.0 -256,1792,11264,1536,ck,13,0,49.6094,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1249.94,1218.0,0.0 -256,1856,11264,1536,ck,13,1,52.3927,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1225.8,1182.69,0.0 -256,1920,11264,1536,ck,13,3,58.0171,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1145.14,1094.58,0.0 -256,1984,11264,1536,ck,13,1,58.9391,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1164.8,1103.59,0.0 -256,2048,11264,1536,ck,13,1,61.3891,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1154.39,1084.63,0.0 -256,4096,11264,1536,ck,13,1,104.1395,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1361.0,1112.62,0.0 -256,8192,11264,1536,ck,13,1,191.8263,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1477.73,1117.85,0.0 -256,16384,11264,1536,ck,13,1,360.1565,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1574.14,1142.74,0.0 -256,32768,11264,1536,ck,13,2,679.5554,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1668.55,1185.82,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,16,128,7168,asm,6,7,6.1485,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,4.78,168.54,0.0073 +gfx950,256,32,128,7168,asm,7,8,6.3047,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,9.31,183.21,0.0012 +gfx950,256,48,128,7168,asm,6,7,6.5243,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,13.5,195.25,0.0073 +gfx950,256,64,128,7168,asm,7,8,6.337,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,18.53,219.76,0.0098 +gfx950,256,80,128,7168,asm,7,8,8.0249,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,18.29,188.34,0.0045 +gfx950,256,96,128,7168,asm,15,8,8.8365,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,19.94,184.49,0.0107 +gfx950,256,112,128,7168,asm,6,7,7.6026,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,27.03,230.05,0.0042 +gfx950,256,128,128,7168,asm,6,7,8.8728,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,26.47,210.51,0.0092 +gfx950,256,144,128,7168,asm,7,8,8.0541,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,32.81,246.65,0.0111 +gfx950,256,160,128,7168,asm,7,8,8.4521,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,34.74,249.09,0.0076 +gfx950,256,176,128,7168,asm,23,8,7.9649,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,40.55,279.24,0.0048 +gfx950,256,192,128,7168,asm,7,8,8.7548,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,40.24,267.61,0.0093 +gfx950,256,208,128,7168,asm,7,8,7.5139,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,50.8,327.62,0.0117 +gfx950,256,224,128,7168,asm,30,7,9.2902,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,44.24,277.76,0.0085 +gfx950,256,240,128,7168,asm,7,8,9.2722,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,47.5,291.11,0.0119 +gfx950,256,256,128,7168,asm,15,8,9.4713,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,49.6,297.54,0.0047 +gfx950,256,288,128,7168,asm,15,8,8.3144,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,63.56,367.51,0.0025 +gfx950,256,320,128,7168,asm,7,8,9.8117,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,59.85,335.64,0.0078 +gfx950,256,352,128,7168,asm,7,8,9.4965,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,68.02,371.8,0.0046 +gfx950,256,384,128,7168,asm,7,8,10.0883,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,69.85,373.53,0.0139 +gfx950,256,416,128,7168,asm,7,8,10.1166,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,75.46,395.97,0.0081 +gfx950,256,448,128,7168,asm,5,6,10.0989,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,81.4,420.19,0.0065 +gfx950,256,480,128,7168,asm,31,8,9.5986,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,91.76,466.84,0.0125 +gfx950,256,512,128,7168,asm,7,8,10.208,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,92.04,462.24,0.0076 +gfx950,256,544,128,7168,asm,7,8,9.4877,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,105.21,522.38,0.0042 +gfx950,256,576,128,7168,asm,7,8,9.4869,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,111.41,547.46,0.0102 +gfx950,256,608,128,7168,asm,7,8,10.3885,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,107.4,522.82,0.0071 +gfx950,256,640,128,7168,asm,7,8,9.5917,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,122.44,591.02,0.0014 +gfx950,256,672,128,7168,asm,7,8,9.8577,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,125.09,599.17,0.005 +gfx950,256,704,128,7168,asm,7,8,10.3689,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,124.59,592.54,0.0098 +gfx950,256,736,128,7168,asm,7,8,10.5103,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,128.5,607.17,0.0015 +gfx950,256,768,128,7168,asm,15,8,10.7333,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,131.3,616.69,0.002 +gfx950,256,800,128,7168,asm,15,8,10.554,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,139.09,649.68,0.0109 +gfx950,256,832,128,7168,asm,7,8,9.7953,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,155.86,724.25,0.0094 +gfx950,256,864,128,7168,asm,7,8,10.7375,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,147.66,682.83,0.0101 +gfx950,256,896,128,7168,asm,7,8,10.8442,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,151.62,698.01,0.0089 +gfx950,256,928,128,7168,asm,15,8,11.0012,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,154.79,709.65,0.0108 +gfx950,256,960,128,7168,asm,7,8,10.8867,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,161.81,738.93,0.0155 +gfx950,256,992,128,7168,asm,7,8,11.0077,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,165.37,752.39,0.0145 +gfx950,256,1024,128,7168,asm,7,8,9.6737,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,194.24,880.71,0.013 +gfx950,256,1088,128,7168,asm,15,8,10.5533,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,189.18,852.32,0.0056 +gfx950,256,1152,128,7168,asm,15,8,11.1072,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,190.32,852.6,0.0109 +gfx950,256,1216,128,7168,asm,15,8,11.6079,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,192.23,856.75,0.0121 +gfx950,256,1280,128,7168,asm,31,8,10.9477,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,214.55,951.82,0.0098 +gfx950,256,1344,128,7168,asm,39,8,10.7369,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,229.7,1014.76,0.0017 +gfx950,256,1408,128,7168,asm,15,8,10.2201,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,252.8,1112.56,0.009 +gfx950,256,1472,128,7168,asm,14,7,11.8021,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,228.87,1003.69,0.0082 +gfx950,256,1536,128,7168,asm,15,8,10.5213,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,267.89,1171.03,0.0109 +gfx950,256,1600,128,7168,asm,22,7,12.0077,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,244.51,1065.64,0.0032 +gfx950,256,1664,128,7168,asm,14,7,12.3669,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,246.91,1073.11,0.0067 +gfx950,256,1728,128,7168,asm,31,8,12.5593,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,252.47,1094.5,0.0063 +gfx950,256,1792,128,7168,asm,21,6,12.5025,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,263.01,1137.48,0.0064 +gfx950,256,1856,128,7168,asm,13,6,12.1277,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,280.83,1211.81,0.0011 +gfx950,256,1920,128,7168,asm,22,7,12.0941,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,291.32,1254.46,0.0058 +gfx950,256,1984,128,7168,asm,29,6,12.2402,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,297.43,1278.31,0.0057 +gfx950,256,2048,128,7168,asm,31,8,11.5875,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,324.32,1391.31,0.0105 +gfx950,256,4096,128,7168,asm,18,3,16.9007,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,444.73,1853.55,0.0002 +gfx950,256,8192,128,7168,ck,17,1,22.7549,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,660.62,2713.04,0.0 +gfx950,256,16384,128,7168,asm,16,1,36.8278,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,816.36,3327.71,0.0 +gfx950,256,32768,128,7168,asm,40,1,57.9434,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1037.73,4214.24,0.0 +gfx950,256,16,2112,7168,ck,7,3,17.369,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.89,882.09,0.0 +gfx950,256,32,2112,7168,ck,7,1,17.1099,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.63,906.1,0.0 +gfx950,256,48,2112,7168,ck,7,3,17.0967,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,85.01,917.47,0.0 +gfx950,256,64,2112,7168,ck,7,1,16.6867,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,116.13,950.93,0.0 +gfx950,256,80,2112,7168,ck,7,1,17.0679,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,141.92,940.37,0.0 +gfx950,256,96,2112,7168,ck,7,2,16.9001,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,171.99,960.49,0.0 +gfx950,256,112,2112,7168,ck,7,1,16.7774,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,202.12,978.38,0.0 +gfx950,256,128,2112,7168,ck,7,1,16.8979,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,229.35,982.19,0.0 +gfx950,256,144,2112,7168,ck,7,0,17.4362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,250.05,962.32,0.0 +gfx950,256,160,2112,7168,ck,12,3,18.1582,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.79,934.1,0.0 +gfx950,256,176,2112,7168,ck,7,2,19.2838,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,276.34,889.03,0.0 +gfx950,256,192,2112,7168,ck,12,2,18.0709,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,321.69,958.78,0.0 +gfx950,256,208,2112,7168,ck,7,2,19.4853,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,323.21,898.54,0.0 +gfx950,256,224,2112,7168,ck,12,1,18.1425,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,373.83,975.09,0.0 +gfx950,256,240,2112,7168,ck,7,1,20.222,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,359.34,883.83,0.0 +gfx950,256,256,2112,7168,ck,12,3,19.3304,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,400.98,934.03,0.0 +gfx950,256,288,2112,7168,ck,12,0,19.761,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,441.27,932.12,0.0 +gfx950,256,320,2112,7168,ck,17,1,21.3295,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,454.25,880.67,0.0 +gfx950,256,352,2112,7168,ck,17,0,22.1299,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,481.6,865.29,0.0 +gfx950,256,384,2112,7168,ck,17,3,21.2294,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,547.67,919.17,0.0 +gfx950,256,416,2112,7168,ck,17,2,22.3197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,564.32,890.6,0.0 +gfx950,256,448,2112,7168,ck,17,3,21.3596,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,635.05,947.7,0.0 +gfx950,256,480,2112,7168,ck,12,1,23.7782,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,611.2,866.63,0.0 +gfx950,256,512,2112,7168,ck,17,1,28.0778,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,552.11,746.91,0.0 +gfx950,256,544,2112,7168,ck,17,0,29.6478,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,555.56,719.65,0.0 +gfx950,256,576,2112,7168,ck,17,0,28.8581,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,604.33,751.98,0.0 +gfx950,256,608,2112,7168,ck,12,0,30.6611,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,600.4,719.65,0.0 +gfx950,256,640,2112,7168,ck,10,2,30.8536,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,628.05,726.97,0.0201 +gfx950,256,672,2112,7168,ck,17,0,32.1211,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,633.43,709.63,0.0 +gfx950,256,704,2112,7168,ck,17,0,31.5718,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,675.14,733.53,0.0 +gfx950,256,736,2112,7168,ck,17,1,31.4913,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,707.63,746.98,0.0 +gfx950,256,768,2112,7168,ck,17,3,31.0549,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,748.78,769.21,0.0 +gfx950,256,800,2112,7168,ck,17,3,32.8542,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,737.26,738.18,0.0 +gfx950,256,832,2112,7168,ck,17,0,32.1611,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,783.28,765.43,0.0 +gfx950,256,864,2112,7168,ck,17,0,33.1831,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,788.35,752.84,0.0 +gfx950,256,896,2112,7168,ck,17,3,32.5403,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,833.7,778.91,0.0 +gfx950,256,928,2112,7168,ck,17,1,34.0573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,825.01,754.92,0.0 +gfx950,256,960,2112,7168,ck,17,0,33.349,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,871.59,781.89,0.0 +gfx950,256,992,2112,7168,ck,15,2,41.9166,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,716.55,630.77,0.0175 +gfx950,256,1024,2112,7168,ck,15,2,41.7085,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,743.36,642.66,0.0022 +gfx950,256,1088,2112,7168,ck,15,1,40.9809,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,803.84,671.86,0.0036 +gfx950,256,1152,2112,7168,ck,15,0,40.8235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,854.41,692.31,0.0036 +gfx950,256,1216,2112,7168,ck,15,3,42.2342,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,871.75,686.45,0.0044 +gfx950,256,1280,2112,7168,ck,15,1,43.6734,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,887.39,680.52,0.0042 +gfx950,256,1344,2112,7168,ck,15,1,45.0358,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,903.57,676.12,0.0055 +gfx950,256,1408,2112,7168,ck,15,0,46.1981,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,922.78,674.89,0.0058 +gfx950,256,1472,2112,7168,ck,15,0,46.921,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.87,680.03,0.0064 +gfx950,256,1536,2112,7168,ck,17,0,55.8902,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,832.1,583.95,0.0 +gfx950,256,1600,2112,7168,ck,15,2,56.4283,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,858.51,591.3,0.0067 +gfx950,256,1664,2112,7168,ck,2,2,57.368,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,878.22,594.32,0.0004 +gfx950,256,1728,2112,7168,ck,15,1,57.7674,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,905.7,602.83,0.0078 +gfx950,256,1792,2112,7168,ck,2,1,59.3469,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,914.24,599.08,0.0006 +gfx950,256,1856,2112,7168,ck,15,3,60.2407,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,932.85,602.29,0.0081 +gfx950,256,1920,2112,7168,ck,15,1,62.5154,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,929.9,592.04,0.008 +gfx950,256,1984,2112,7168,ck,15,3,64.5853,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,930.1,584.35,0.0082 +gfx950,256,2048,2112,7168,ck,17,1,73.8185,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,840.01,521.14,0.0 +gfx950,256,4096,2112,7168,ck,15,0,122.5407,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1012.05,504.33,0.0085 +gfx950,256,8192,2112,7168,ck,15,3,223.4002,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1110.27,485.51,0.0091 +gfx950,256,16384,2112,7168,ck,15,1,411.1387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1206.57,490.8,0.0095 +gfx950,256,32768,2112,7168,ck,17,0,804.4843,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,1233.26,482.83,0.0 +gfx950,256,16,2240,7168,ck,7,2,17.3187,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.67,937.87,0.0 +gfx950,256,32,2240,7168,ck,7,1,17.2052,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,59.73,954.89,0.0 +gfx950,256,48,2240,7168,ck,7,0,17.1856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,89.69,966.82,0.0 +gfx950,256,64,2240,7168,ck,7,2,17.1082,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,120.13,982.09,0.0 +gfx950,256,80,2240,7168,ck,7,1,17.1957,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,149.4,987.93,0.0 +gfx950,256,96,2240,7168,ck,7,0,17.0763,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,180.53,1005.75,0.0 +gfx950,256,112,2240,7168,ck,7,2,17.0073,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,211.47,1020.79,0.0 +gfx950,256,128,2240,7168,ck,12,3,18.3162,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,224.41,958.02,0.0 +gfx950,256,144,2240,7168,ck,7,1,17.4925,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,264.35,1013.78,0.0 +gfx950,256,160,2240,7168,ck,12,0,18.2215,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,281.98,983.45,0.0 +gfx950,256,176,2240,7168,ck,7,0,19.5377,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,289.28,926.74,0.0 +gfx950,256,192,2240,7168,ck,12,3,18.244,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,337.95,1002.67,0.0 +gfx950,256,208,2240,7168,ck,7,1,19.7375,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,338.41,936.24,0.0 +gfx950,256,224,2240,7168,ck,12,3,18.0754,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,397.96,1032.65,0.0 +gfx950,256,240,2240,7168,ck,17,1,22.0355,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,349.76,855.52,0.0 +gfx950,256,256,2240,7168,ck,17,1,21.1511,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,388.67,900.1,0.0 +gfx950,256,288,2240,7168,ck,12,0,19.8488,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,465.94,977.94,0.0 +gfx950,256,320,2240,7168,ck,17,2,21.2632,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,483.28,930.42,0.0 +gfx950,256,352,2240,7168,ck,17,1,22.208,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,508.99,907.62,0.0 +gfx950,256,384,2240,7168,ck,17,2,21.4024,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,576.16,959.2,0.0 +gfx950,256,416,2240,7168,ck,17,1,22.4706,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,594.5,930.19,0.0 +gfx950,256,448,2240,7168,ck,17,2,21.5077,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,668.9,989.16,0.0 +gfx950,256,480,2240,7168,ck,10,3,29.6061,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,520.64,731.18,0.0197 +gfx950,256,512,2240,7168,ck,15,2,31.3316,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,524.76,702.81,0.0005 +gfx950,256,544,2240,7168,ck,17,2,29.8347,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,585.54,750.56,0.0 +gfx950,256,576,2240,7168,ck,17,0,29.2116,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,633.2,779.33,0.0 +gfx950,256,608,2240,7168,ck,12,3,31.1053,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,627.69,743.87,0.0 +gfx950,256,640,2240,7168,ck,17,3,31.3436,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,655.7,750.11,0.0 +gfx950,256,672,2240,7168,ck,17,2,32.4726,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,664.55,735.51,0.0 +gfx950,256,704,2240,7168,ck,17,3,31.5542,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,716.46,768.73,0.0 +gfx950,256,736,2240,7168,ck,17,1,32.7615,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,721.42,751.77,0.0 +gfx950,256,768,2240,7168,ck,17,0,31.996,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,770.8,781.41,0.0 +gfx950,256,800,2240,7168,ck,17,2,33.3508,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,770.3,760.84,0.0 +gfx950,256,832,2240,7168,ck,17,1,32.6843,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,817.45,787.76,0.0 +gfx950,256,864,2240,7168,ck,17,2,34.2322,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,810.5,763.03,0.0 +gfx950,256,896,2240,7168,ck,17,1,33.0197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,871.39,802.34,0.0 +gfx950,256,928,2240,7168,ck,15,2,41.585,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,716.62,646.04,0.0173 +gfx950,256,960,2240,7168,ck,15,1,39.9713,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,771.26,681.45,0.0025 +gfx950,256,992,2240,7168,ck,15,1,43.5147,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,732.07,634.52,0.0179 +gfx950,256,1024,2240,7168,ck,2,2,44.5432,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,738.23,628.24,0.0001 +gfx950,256,1088,2240,7168,ck,15,2,41.8467,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,834.92,686.54,0.0035 +gfx950,256,1152,2240,7168,ck,15,1,42.3981,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,872.53,695.19,0.0043 +gfx950,256,1216,2240,7168,ck,15,1,44.3198,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,881.07,681.87,0.0047 +gfx950,256,1280,2240,7168,ck,15,0,46.0014,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,893.54,673.15,0.0057 +gfx950,256,1344,2240,7168,ck,15,3,46.7258,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,923.67,678.67,0.006 +gfx950,256,1408,2240,7168,ck,15,3,55.5671,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,813.69,584.1,0.0061 +gfx950,256,1472,2240,7168,ck,17,0,56.1099,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,842.45,591.73,0.0 +gfx950,256,1536,2240,7168,ck,15,0,57.0356,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,864.81,595.2,0.0071 +gfx950,256,1600,2240,7168,ck,15,0,57.9207,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,887.08,598.98,0.0072 +gfx950,256,1664,2240,7168,ck,15,0,58.3656,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,915.53,607.18,0.008 +gfx950,256,1728,2240,7168,ck,15,0,59.5866,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,931.26,607.25,0.0078 +gfx950,256,1792,2240,7168,ck,15,1,60.5927,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.72,609.47,0.0083 +gfx950,256,1856,2240,7168,ck,17,1,62.6283,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,951.66,601.57,0.0 +gfx950,256,1920,2240,7168,ck,17,2,69.6783,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,884.87,551.4,0.0 +gfx950,256,1984,2240,7168,ck,15,0,69.3874,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,918.2,564.45,0.0079 +gfx950,256,2048,2240,7168,ck,17,0,75.0354,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,876.48,531.9,0.0 +gfx950,256,4096,2240,7168,ck,15,0,125.1988,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1050.6,509.32,0.0092 +gfx950,256,8192,2240,7168,ck,15,0,228.9556,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1148.99,486.89,0.0096 +gfx950,256,16384,2240,7168,ck,15,3,434.2124,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1211.7,476.49,0.0095 +gfx950,256,32768,2240,7168,ck,15,2,834.7736,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1260.54,476.46,0.0099 +gfx950,256,16,3072,1536,ck,7,3,6.4633,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.36,749.07,0.0 +gfx950,256,16,3072,3072,asm,5,6,7.3727,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,40.96,1300.02,0.0003 +gfx950,256,32,3072,1536,ck,7,3,6.5001,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.46,763.73,0.0 +gfx950,256,32,3072,3072,asm,15,8,8.2748,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,72.99,1176.11,0.0006 +gfx950,256,48,3072,1536,ck,7,0,6.5182,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,69.5,780.47,0.0 +gfx950,256,48,3072,3072,asm,4,5,9.5269,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,95.1,1037.02,0.0001 +gfx950,256,64,3072,1536,ck,6,3,6.4157,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,94.14,812.09,0.0 +gfx950,256,64,3072,3072,asm,3,4,9.2705,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,130.3,1081.6,0.0001 +gfx950,256,80,3072,1536,ck,7,1,6.2015,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,121.74,859.95,0.0 +gfx950,256,80,3072,3072,asm,12,5,9.6381,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,156.66,1055.65,0.0001 +gfx950,256,96,3072,1536,ck,7,1,6.1725,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,146.78,883.9,0.0 +gfx950,256,96,3072,3072,asm,19,4,9.2652,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,195.56,1114.05,0.0001 +gfx950,256,112,3072,1536,ck,7,3,6.5959,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,160.25,845.79,0.0 +gfx950,256,112,3072,3072,ck,7,0,9.9923,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,211.56,1047.74,0.0 +gfx950,256,128,3072,1536,ck,7,0,6.2613,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,192.92,910.61,0.0 +gfx950,256,128,3072,3072,ck,7,0,10.3718,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,232.93,1023.62,0.0 +gfx950,256,144,3072,1536,ck,12,1,6.338,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,214.41,918.98,0.0 +gfx950,256,144,3072,3072,ck,7,0,10.2283,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,265.72,1052.4,0.0 +gfx950,256,160,3072,1536,ck,12,0,6.0824,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.25,977.8,0.0 +gfx950,256,160,3072,3072,ck,12,1,9.8728,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,305.88,1105.23,0.0 +gfx950,256,176,3072,1536,ck,6,3,7.3433,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,226.19,826.64,0.0 +gfx950,256,176,3072,3072,ck,12,1,11.0912,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,299.51,997.11,0.0 +gfx950,256,192,3072,1536,ck,6,0,6.9211,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,261.8,894.82,0.0 +gfx950,256,192,3072,3072,ck,12,3,10.5231,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,344.37,1064.96,0.0 +gfx950,256,208,3072,1536,ck,12,1,7.2509,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,270.72,871.07,0.0 +gfx950,256,208,3072,3072,ck,17,2,11.654,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,336.87,974.27,0.0 +gfx950,256,224,3072,1536,ck,12,1,7.2036,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,293.45,893.85,0.0 +gfx950,256,224,3072,3072,ck,12,3,11.5304,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,366.67,997.5,0.0 +gfx950,256,240,3072,1536,ck,11,1,7.2241,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,313.52,908.32,0.0 +gfx950,256,240,3072,3072,ck,17,3,11.735,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,386.01,992.67,0.0 +gfx950,256,256,3072,1536,ck,11,3,7.3204,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,330.03,913.16,0.0 +gfx950,256,256,3072,3072,ck,17,2,11.4432,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,422.25,1030.87,0.0 +gfx950,256,288,3072,1536,ck,12,3,7.4073,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,366.92,935.62,0.0 +gfx950,256,288,3072,3072,ck,12,0,11.9257,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,455.81,1013.89,0.0 +gfx950,256,320,3072,1536,ck,17,0,7.4931,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,403.02,957.71,0.0 +gfx950,256,320,3072,3072,ck,17,0,11.7025,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,516.11,1058.43,0.0 +gfx950,256,352,3072,1536,ck,10,0,9.4059,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,353.17,789.07,0.0212 +gfx950,256,352,3072,3072,asm,8,1,14.6095,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,454.76,868.01,0.0 +gfx950,256,384,3072,1536,ck,12,3,9.3019,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,389.58,824.32,0.0 +gfx950,256,384,3072,3072,asm,8,1,14.6577,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,494.47,885.28,0.0 +gfx950,256,416,3072,1536,asm,8,1,9.7878,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,401.1,808.5,0.0 +gfx950,256,416,3072,3072,asm,8,1,14.702,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,534.06,902.67,0.0 +gfx950,256,448,3072,1536,ck,12,1,9.7709,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,432.7,835.05,0.0 +gfx950,256,448,3072,3072,asm,8,1,14.7225,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,574.34,921.44,0.0 +gfx950,256,480,3072,1536,ck,12,3,10.0889,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,448.99,833.09,0.0 +gfx950,256,480,3072,3072,asm,8,1,14.8006,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,612.12,936.51,0.0 +gfx950,256,512,3072,1536,ck,12,2,10.3534,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,466.69,835.55,0.0 +gfx950,256,512,3072,3072,asm,16,1,15.8579,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,609.39,892.66,0.0 +gfx950,256,544,3072,1536,ck,11,3,10.5019,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,488.85,847.13,0.0 +gfx950,256,544,3072,3072,asm,16,1,15.8752,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,646.77,910.27,0.0 +gfx950,256,576,3072,1536,ck,17,1,10.5455,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,515.46,866.94,0.0 +gfx950,256,576,3072,3072,asm,16,1,15.9596,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,681.2,923.93,0.0 +gfx950,256,608,3072,1536,ck,11,0,10.6701,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,537.75,879.84,0.0 +gfx950,256,608,3072,3072,asm,16,1,16.0084,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,716.85,939.54,0.0 +gfx950,256,640,3072,1536,ck,17,1,10.7885,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,559.84,892.97,0.0 +gfx950,256,640,3072,3072,asm,16,1,16.2216,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,744.66,945.37,0.0 +gfx950,256,672,3072,1536,ck,11,3,10.9913,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,576.98,898.85,0.0 +gfx950,256,672,3072,3072,asm,24,1,17.2482,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,735.36,906.2,0.0 +gfx950,256,704,3072,1536,asm,24,1,12.1215,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,548.1,835.32,0.0 +gfx950,256,704,3072,3072,asm,24,1,17.5391,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,757.6,907.99,0.0 +gfx950,256,736,3072,1536,asm,24,1,11.7277,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,592.25,884.32,0.0 +gfx950,256,736,3072,3072,asm,24,1,17.5502,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,791.53,924.22,0.0 +gfx950,256,768,3072,1536,asm,24,1,11.8225,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,613.05,898.02,0.0 +gfx950,256,768,3072,3072,asm,24,1,17.4237,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,831.94,947.85,0.0 +gfx950,256,800,3072,1536,asm,24,1,12.0428,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,626.91,902.0,0.0 +gfx950,256,800,3072,3072,asm,24,1,17.7973,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,848.41,944.52,0.0 +gfx950,256,832,3072,1536,asm,32,1,13.235,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,593.26,839.32,0.0 +gfx950,256,832,3072,3072,asm,32,1,19.303,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,813.53,886.13,0.0 +gfx950,256,864,3072,1536,asm,32,1,13.3818,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,609.31,848.47,0.0 +gfx950,256,864,3072,3072,asm,32,1,19.4672,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,837.69,893.8,0.0 +gfx950,256,896,3072,1536,asm,32,1,12.9966,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,650.61,892.53,0.0 +gfx950,256,896,3072,3072,asm,32,1,19.4031,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,871.58,911.95,0.0 +gfx950,256,928,3072,1536,asm,32,1,13.4375,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,651.74,881.54,0.0 +gfx950,256,928,3072,3072,asm,32,1,19.6687,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,890.52,914.63,0.0 +gfx950,256,960,3072,1536,asm,32,1,13.6336,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,664.51,886.88,0.0 +gfx950,256,960,3072,3072,asm,32,1,19.8975,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,910.64,918.94,0.0 +gfx950,256,992,3072,1536,ck,15,1,14.2672,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,656.17,864.72,0.0137 +gfx950,256,992,3072,3072,ck,14,3,22.8957,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,817.77,811.48,0.0001 +gfx950,256,1024,3072,1536,ck,15,3,14.2212,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,679.53,884.8,0.0014 +gfx950,256,1024,3072,3072,ck,14,3,22.9132,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,843.5,823.73,0.0001 +gfx950,256,1088,3072,1536,ck,14,0,14.3467,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,715.68,911.32,0.0 +gfx950,256,1088,3072,3072,ck,14,3,22.4901,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,913.08,865.46,0.0001 +gfx950,256,1152,3072,1536,ck,14,3,14.3806,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,755.99,943.35,0.0 +gfx950,256,1152,3072,3072,ck,14,0,22.6081,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,961.75,887.03,0.0001 +gfx950,256,1216,3072,1536,ck,14,3,14.6973,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,780.8,956.47,0.0 +gfx950,256,1216,3072,3072,ck,14,2,23.0364,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,996.3,896.14,0.0001 +gfx950,256,1280,3072,1536,ck,14,1,14.8836,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,811.6,977.52,0.0 +gfx950,256,1280,3072,3072,ck,14,1,23.4477,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1030.34,905.58,0.0001 +gfx950,256,1344,3072,1536,ck,14,3,15.1183,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,838.96,994.85,0.0 +gfx950,256,1344,3072,3072,ck,14,0,23.8751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1062.49,914.07,0.0001 +gfx950,256,1408,3072,1536,ck,14,0,18.3176,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,725.4,847.93,0.0 +gfx950,256,1408,3072,3072,ck,14,3,29.2608,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,908.22,765.98,0.0001 +gfx950,256,1472,3072,1536,ck,14,1,18.5424,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,749.18,864.16,0.0001 +gfx950,256,1472,3072,3072,ck,14,3,29.6273,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,937.75,776.42,0.0001 +gfx950,256,1536,3072,1536,ck,14,0,18.6045,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,779.14,887.69,0.0001 +gfx950,256,1536,3072,3072,ck,14,1,29.4698,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,983.75,800.58,0.0002 +gfx950,256,1600,3072,1536,ck,14,2,18.8,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,803.16,904.61,0.0001 +gfx950,256,1600,3072,3072,ck,14,3,29.7895,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1013.75,811.79,0.0002 +gfx950,256,1664,3072,1536,ck,14,0,18.9387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,829.17,923.93,0.0001 +gfx950,256,1664,3072,3072,ck,14,0,30.0458,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1045.3,824.49,0.0001 +gfx950,256,1728,3072,1536,ck,14,3,19.2012,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,849.29,936.9,0.0002 +gfx950,256,1728,3072,3072,ck,14,2,30.4353,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1071.61,833.32,0.0002 +gfx950,256,1792,3072,1536,ck,14,1,19.3448,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,874.21,955.36,0.0001 +gfx950,256,1792,3072,3072,ck,14,0,30.8284,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1097.13,841.83,0.0002 +gfx950,256,1856,3072,1536,ck,14,1,19.4996,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,898.24,972.98,0.0002 +gfx950,256,1856,3072,3072,ck,14,0,31.8273,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1100.65,833.94,0.0002 +gfx950,256,1920,3072,1536,ck,14,2,19.5725,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,925.76,994.47,0.0002 +gfx950,256,1920,3072,3072,ck,14,0,31.7613,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1140.97,854.24,0.0002 +gfx950,256,1984,3072,1536,ck,14,1,20.1808,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,927.78,988.85,0.0002 +gfx950,256,1984,3072,3072,ck,14,3,33.9609,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1102.64,816.28,0.0002 +gfx950,256,2048,3072,1536,ck,14,2,20.3677,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,948.92,1003.9,0.0001 +gfx950,256,2048,3072,3072,ck,14,3,33.9279,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1139.32,834.46,0.0002 +gfx950,256,4096,3072,1536,ck,13,0,32.8903,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1175.26,1099.89,0.0 +gfx950,256,4096,3072,3072,ck,13,1,56.2027,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1375.55,839.57,0.0 +gfx950,256,8192,3072,1536,ck,13,0,60.7603,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1272.37,1113.11,0.0 +gfx950,256,8192,3072,3072,ck,13,0,98.9536,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1562.54,858.33,0.0 +gfx950,256,16384,3072,1536,ck,13,2,113.3639,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1363.92,1151.58,0.0 +gfx950,256,16384,3072,3072,ck,13,1,188.8317,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1637.64,849.6,0.0 +gfx950,256,32768,3072,1536,ck,13,0,206.2023,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1499.68,1243.33,0.0 +gfx950,256,32768,3072,3072,ck,13,1,358.9075,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1723.22,867.71,0.0 +gfx950,256,16,4096,512,ck,7,3,3.0553,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.96,731.98,0.0 +gfx950,256,16,4096,3072,asm,5,6,8.5265,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,47.22,1496.88,0.0001 +gfx950,256,32,4096,512,ck,7,1,3.352,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.04,708.74,0.0 +gfx950,256,32,4096,3072,asm,5,6,9.3323,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,86.29,1386.94,0.0001 +gfx950,256,48,4096,512,ck,7,2,3.4163,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,58.93,736.16,0.0 +gfx950,256,48,4096,3072,asm,13,6,9.3261,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,129.52,1407.19,0.0002 +gfx950,256,64,4096,512,ck,7,0,3.5282,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,76.08,752.28,0.0 +gfx950,256,64,4096,3072,ck,12,0,9.8624,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,163.31,1348.94,0.0 +gfx950,256,80,4096,512,ck,7,1,3.9461,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,85.03,707.91,0.0 +gfx950,256,80,4096,3072,asm,11,4,10.0921,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,199.49,1336.1,0.0001 +gfx950,256,96,4096,512,ck,7,1,3.9483,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,101.98,742.78,0.0 +gfx950,256,96,4096,3072,ck,12,0,9.741,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.02,1402.76,0.0 +gfx950,256,112,4096,512,ck,6,1,4.0039,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,117.33,767.25,0.0 +gfx950,256,112,4096,3072,ck,7,0,10.2261,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,275.63,1353.84,0.0 +gfx950,256,128,4096,512,ck,7,0,3.933,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,136.5,816.49,0.0 +gfx950,256,128,4096,3072,ck,12,1,9.764,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,329.91,1436.37,0.0 +gfx950,256,144,4096,512,ck,7,1,4.6145,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,130.89,726.09,0.0 +gfx950,256,144,4096,3072,ck,17,0,11.6095,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,312.15,1223.56,0.0 +gfx950,256,160,4096,512,ck,7,0,4.7158,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,142.31,740.02,0.0 +gfx950,256,160,4096,3072,ck,17,0,11.648,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,345.68,1234.99,0.0 +gfx950,256,176,4096,512,ck,7,1,4.6429,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,158.99,781.64,0.0 +gfx950,256,176,4096,3072,ck,17,1,11.723,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,377.82,1242.46,0.0 +gfx950,256,192,4096,512,ck,7,0,4.6761,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,172.22,805.87,0.0 +gfx950,256,192,4096,3072,ck,17,0,11.4303,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,422.72,1290.04,0.0 +gfx950,256,208,4096,512,ck,11,2,4.762,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,183.2,820.58,0.0 +gfx950,256,208,4096,3072,ck,17,2,11.7962,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,443.74,1265.31,0.0 +gfx950,256,224,4096,512,ck,11,2,4.8073,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,195.44,841.81,0.0 +gfx950,256,224,4096,3072,ck,17,3,11.8189,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,476.96,1278.13,0.0 +gfx950,256,240,4096,512,ck,11,1,4.7521,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,211.83,880.9,0.0 +gfx950,256,240,4096,3072,ck,17,1,11.8801,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,508.4,1286.71,0.0 +gfx950,256,256,4096,512,ck,11,2,4.7337,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,226.83,913.74,0.0 +gfx950,256,256,4096,3072,ck,17,2,11.5786,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,556.41,1335.78,0.0 +gfx950,256,288,4096,512,ck,12,2,5.7243,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,211.02,804.27,0.0 +gfx950,256,288,4096,3072,asm,8,1,14.4923,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,500.11,1092.09,0.0 +gfx950,256,320,4096,512,ck,12,3,5.7755,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,232.39,845.37,0.0 +gfx950,256,320,4096,3072,asm,8,1,14.6266,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,550.58,1106.71,0.0 +gfx950,256,352,4096,512,ck,12,0,5.8203,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,253.66,886.72,0.0 +gfx950,256,352,4096,3072,asm,8,1,14.722,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,601.71,1124.02,0.0 +gfx950,256,384,4096,512,ck,12,0,5.8685,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,274.45,926.9,0.0 +gfx950,256,384,4096,3072,asm,8,1,14.8001,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,652.95,1142.44,0.0 +gfx950,256,416,4096,512,ck,7,0,6.0603,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,287.91,943.52,0.0 +gfx950,256,416,4096,3072,asm,16,1,15.6767,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,667.81,1101.55,0.0 +gfx950,256,448,4096,512,ck,15,0,6.0223,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,312.02,995.72,0.0001 +gfx950,256,448,4096,3072,asm,16,1,15.7201,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,717.19,1121.44,0.0 +gfx950,256,480,4096,512,ck,15,1,6.0166,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,334.62,1042.96,0.0122 +gfx950,256,480,4096,3072,asm,16,1,15.7477,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,767.07,1142.37,0.0 +gfx950,256,512,4096,512,ck,9,2,6.0597,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,354.39,1081.51,0.0 +gfx950,256,512,4096,3072,asm,16,1,15.937,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,808.49,1151.41,0.0 +gfx950,256,544,4096,512,ck,11,2,6.3309,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,360.41,1079.17,0.0 +gfx950,256,544,4096,3072,asm,24,1,17.1008,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,800.56,1094.13,0.0 +gfx950,256,576,4096,512,ck,9,3,6.3106,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,382.84,1126.78,0.0 +gfx950,256,576,4096,3072,asm,24,1,17.4405,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,831.14,1093.49,0.0 +gfx950,256,608,4096,512,ck,9,3,6.3107,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,404.1,1170.9,0.0 +gfx950,256,608,4096,3072,asm,24,1,17.2655,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,886.21,1125.45,0.0 +gfx950,256,640,4096,512,ck,12,1,6.5354,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,410.74,1173.26,0.0 +gfx950,256,640,4096,3072,asm,24,1,17.8108,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,904.29,1111.23,0.0 +gfx950,256,672,4096,512,ck,9,2,6.5833,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,428.14,1207.03,0.0 +gfx950,256,672,4096,3072,asm,32,1,19.4713,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,868.53,1034.98,0.0 +gfx950,256,704,4096,512,ck,11,2,6.711,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,439.99,1225.57,0.0 +gfx950,256,704,4096,3072,asm,32,1,19.1316,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,926.05,1072.19,0.0 +gfx950,256,736,4096,512,ck,11,2,7.0987,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,434.87,1197.87,0.0 +gfx950,256,736,4096,3072,asm,32,1,19.2547,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,961.95,1084.06,0.0 +gfx950,256,768,4096,512,ck,9,0,6.8799,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,468.21,1276.45,0.0 +gfx950,256,768,4096,3072,asm,32,1,19.3192,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1000.42,1099.1,0.0 +gfx950,256,800,4096,512,ck,9,3,8.1419,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,412.12,1112.81,0.0 +gfx950,256,800,4096,3072,asm,40,1,23.1091,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,871.2,934.44,0.0 +gfx950,256,832,4096,512,ck,16,0,8.2291,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,424.06,1134.86,0.0 +gfx950,256,832,4096,3072,ck,14,2,22.6787,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,923.24,968.07,0.0001 +gfx950,256,864,4096,512,ck,14,2,8.1323,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,445.62,1182.62,0.0 +gfx950,256,864,4096,3072,ck,14,3,23.4387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,927.67,952.06,0.0001 +gfx950,256,896,4096,512,ck,12,1,8.2695,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,454.45,1196.68,0.0 +gfx950,256,896,4096,3072,ck,14,1,23.3659,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,965.02,970.45,0.0001 +gfx950,256,928,4096,512,ck,16,2,8.5579,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,454.82,1188.9,0.0 +gfx950,256,928,4096,3072,asm,40,1,23.5093,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,993.39,979.86,0.0 +gfx950,256,960,4096,512,ck,16,1,8.4661,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,475.61,1234.69,0.0 +gfx950,256,960,4096,3072,ck,14,3,23.7112,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1018.89,986.72,0.0001 +gfx950,256,992,4096,512,ck,16,2,8.7551,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,475.24,1225.74,0.0 +gfx950,256,992,4096,3072,asm,40,1,23.5538,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1059.89,1008.62,0.0 +gfx950,256,1024,4096,512,ck,16,3,8.7835,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,488.98,1253.49,0.0 +gfx950,256,1024,4096,3072,asm,40,1,23.8252,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1081.62,1012.26,0.0 +gfx950,256,1088,4096,512,ck,11,3,10.1949,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,447.62,1134.6,0.0 +gfx950,256,1088,4096,3072,ck,14,1,29.8181,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,918.25,832.99,0.0002 +gfx950,256,1152,4096,512,ck,11,1,10.1493,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,476.08,1194.58,0.0 +gfx950,256,1152,4096,3072,ck,14,2,29.8302,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,971.87,856.82,0.0002 +gfx950,256,1216,4096,512,ck,9,0,10.3706,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,491.8,1222.8,0.0 +gfx950,256,1216,4096,3072,ck,14,2,29.9089,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1023.16,878.67,0.0001 +gfx950,256,1280,4096,512,ck,11,1,10.1941,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,526.65,1298.62,0.0 +gfx950,256,1280,4096,3072,ck,14,0,30.2437,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1065.09,892.78,0.0002 +gfx950,256,1344,4096,512,ck,11,3,10.9237,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,516.05,1262.88,0.0 +gfx950,256,1344,4096,3072,ck,14,0,31.1831,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1084.65,889.0,0.0001 +gfx950,256,1408,4096,512,ck,14,3,11.0822,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,532.89,1295.08,0.0001 +gfx950,256,1408,4096,3072,ck,14,1,31.7025,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1117.69,897.17,0.0002 +gfx950,256,1472,4096,512,ck,14,1,11.2016,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,551.17,1331.01,0.0001 +gfx950,256,1472,4096,3072,ck,14,1,31.8961,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1161.4,914.33,0.0002 +gfx950,256,1536,4096,512,ck,14,3,11.3212,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,569.06,1366.15,0.0002 +gfx950,256,1536,4096,3072,ck,13,1,32.3752,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1193.96,923.07,0.0 +gfx950,256,1600,4096,512,ck,11,1,12.1644,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,551.68,1317.25,0.0 +gfx950,256,1600,4096,3072,ck,13,3,33.1053,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1216.28,924.48,0.0 +gfx950,256,1664,4096,512,ck,11,1,12.2937,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,567.72,1348.71,0.0 +gfx950,256,1664,4096,3072,ck,13,3,33.6312,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1245.15,931.46,0.0 +gfx950,256,1728,4096,512,ck,11,1,12.4573,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,581.81,1375.71,0.0 +gfx950,256,1728,4096,3072,ck,13,3,33.2495,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1307.89,963.84,0.0 +gfx950,256,1792,4096,512,ck,11,3,12.5727,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,597.82,1407.39,0.0 +gfx950,256,1792,4096,3072,ck,13,3,33.6704,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1339.37,973.2,0.0 +gfx950,256,1856,4096,512,ck,11,0,13.3033,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,585.17,1371.97,0.0 +gfx950,256,1856,4096,3072,ck,13,2,35.6071,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1311.75,940.51,0.0 +gfx950,256,1920,4096,512,ck,11,3,13.6155,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,591.46,1381.43,0.0 +gfx950,256,1920,4096,3072,ck,13,3,37.0853,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1302.9,922.46,0.0 +gfx950,256,1984,4096,512,ck,16,0,13.8619,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,600.31,1397.06,0.0 +gfx950,256,1984,4096,3072,ck,13,3,38.9825,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1280.81,896.06,0.0 +gfx950,256,2048,4096,512,ck,9,2,14.4383,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,594.94,1379.87,0.0 +gfx950,256,2048,4096,3072,ck,13,1,39.4437,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1306.66,903.86,0.0 +gfx950,256,4096,4096,512,ck,16,3,23.585,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,728.42,1600.54,0.0 +gfx950,256,4096,4096,3072,ck,13,2,71.402,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1443.65,822.39,0.0 +gfx950,256,8192,4096,512,ck,16,1,41.9105,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,819.84,1751.36,0.0 +gfx950,256,8192,4096,3072,ck,13,2,132.4593,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1556.39,791.62,0.0 +gfx950,256,16384,4096,512,ck,16,2,78.2029,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,878.73,1850.36,0.0 +gfx950,256,16384,4096,3072,ck,13,3,248.0474,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1662.25,794.74,0.0 +gfx950,256,32768,4096,512,ck,16,2,143.2204,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,959.63,2006.07,0.0 +gfx950,256,32768,4096,3072,ck,13,3,481.3425,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1713.2,792.95,0.0 +gfx950,256,16,4608,7168,asm,22,7,10.2718,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,102.9,3241.13,0.0059 +gfx950,256,32,4608,7168,asm,29,6,11.6205,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,181.91,2887.52,0.004 +gfx950,256,48,4608,7168,asm,13,6,13.7652,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,230.36,2456.67,0.0037 +gfx950,256,64,4608,7168,asm,29,6,12.857,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,328.84,2650.6,0.0036 +gfx950,256,80,4608,7168,asm,37,6,14.7725,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,357.75,2324.65,0.0047 +gfx950,256,96,4608,7168,asm,37,6,15.4561,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,410.31,2238.79,0.0039 +gfx950,256,112,4608,7168,asm,18,3,17.8925,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,413.51,1948.59,0.0004 +gfx950,256,128,4608,7168,asm,18,3,18.0058,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,469.61,1950.89,0.0004 +gfx950,256,144,4608,7168,asm,26,3,20.2285,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,470.26,1749.48,0.0004 +gfx950,256,160,4608,7168,asm,26,3,20.8025,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,508.09,1713.81,0.0004 +gfx950,256,176,4608,7168,asm,34,3,21.847,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,532.18,1643.87,0.0004 +gfx950,256,192,4608,7168,ck,17,3,21.9076,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,578.96,1651.29,0.0 +gfx950,256,208,4608,7168,asm,42,3,23.3806,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,587.69,1558.47,0.0004 +gfx950,256,224,4608,7168,ck,11,3,23.044,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,642.14,1592.61,0.0 +gfx950,256,240,4608,7168,asm,25,2,25.8285,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,613.84,1431.07,0.0 +gfx950,256,256,4608,7168,asm,33,2,26.9625,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,627.22,1380.6,0.0 +gfx950,256,288,4608,7168,asm,33,2,27.8307,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,683.61,1356.37,0.0 +gfx950,256,320,4608,7168,asm,8,1,29.3886,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,719.3,1302.31,0.0 +gfx950,256,352,4608,7168,asm,41,2,30.6762,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,758.02,1264.74,0.0 +gfx950,256,384,4608,7168,asm,16,1,30.5188,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,831.2,1288.44,0.0 +gfx950,256,416,4608,7168,asm,16,1,30.6272,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,897.28,1301.0,0.0 +gfx950,256,448,4608,7168,asm,16,1,31.1697,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,949.48,1295.17,0.0 +gfx950,256,480,4608,7168,asm,24,1,33.299,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,952.25,1228.1,0.0 +gfx950,256,512,4608,7168,asm,24,1,33.577,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,1007.32,1233.55,0.0 +gfx950,256,544,4608,7168,asm,24,1,33.9623,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,1058.14,1234.99,0.0 +gfx950,256,576,4608,7168,asm,32,1,36.4667,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1043.44,1164.55,0.0 +gfx950,256,608,4608,7168,asm,32,1,39.2737,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1022.69,1094.67,0.0 +gfx950,256,640,4608,7168,asm,32,1,37.7105,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1121.14,1153.95,0.0 +gfx950,256,672,4608,7168,asm,32,1,38.6714,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,1147.94,1138.83,0.0 +gfx950,256,704,4608,7168,asm,40,1,45.5781,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1020.37,977.76,0.0 +gfx950,256,736,4608,7168,asm,40,1,47.3086,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1027.73,953.08,0.0 +gfx950,256,768,4608,7168,asm,40,1,46.0129,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1102.61,991.31,0.0 +gfx950,256,800,4608,7168,asm,40,1,47.395,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1115.06,973.46,0.0 +gfx950,256,832,4608,7168,ck,13,1,48.1976,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1140.35,968.13,0.0 +gfx950,256,864,4608,7168,asm,40,1,49.4742,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,1153.65,953.75,0.0 +gfx950,256,896,4608,7168,ck,13,0,49.2493,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1201.84,968.75,0.0 +gfx950,256,928,4608,7168,ck,14,2,61.0416,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1004.3,790.19,0.0001 +gfx950,256,960,4608,7168,ck,13,2,60.7415,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1044.06,802.73,0.0 +gfx950,256,992,4608,7168,ck,14,2,64.9887,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1008.36,758.33,0.0001 +gfx950,256,1024,4608,7168,ck,14,3,64.9234,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1041.93,767.17,0.0001 +gfx950,256,1088,4608,7168,ck,14,0,62.6762,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1146.74,811.41,0.0002 +gfx950,256,1152,4608,7168,ck,13,2,63.8589,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1191.71,812.8,0.0 +gfx950,256,1216,4608,7168,ck,14,3,64.0235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1254.68,827.09,0.0001 +gfx950,256,1280,4608,7168,ck,13,3,65.4649,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1291.64,824.89,0.0 +gfx950,256,1344,4608,7168,ck,14,2,68.5828,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1294.57,802.68,0.0002 +gfx950,256,1408,4608,7168,ck,13,1,65.8692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1412.08,851.67,0.0 +gfx950,256,1472,4608,7168,ck,13,3,69.1204,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1406.83,826.78,0.0 +gfx950,256,1536,4608,7168,ck,13,3,73.8412,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1374.15,788.12,0.0 +gfx950,256,1600,4608,7168,ck,13,2,74.9489,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1410.25,790.47,0.0 +gfx950,256,1664,4608,7168,ck,13,0,76.3776,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1439.22,789.41,0.0 +gfx950,256,1728,4608,7168,ck,13,1,77.1936,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1478.78,794.65,0.0 +gfx950,256,1792,4608,7168,ck,13,0,78.3613,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1510.7,796.19,0.0 +gfx950,256,1856,4608,7168,ck,14,1,99.0642,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1237.66,640.38,0.0002 +gfx950,256,1920,4608,7168,ck,14,2,99.9722,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1268.71,645.05,0.0001 +gfx950,256,1984,4608,7168,ck,13,3,101.9007,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1286.19,643.14,0.0 +gfx950,256,2048,4608,7168,ck,14,2,105.2531,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1285.39,632.61,0.0002 +gfx950,256,4096,4608,7168,ck,13,2,171.4096,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1578.58,584.21,0.0 +gfx950,256,8192,4608,7168,ck,13,0,305.5678,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1771.02,547.33,0.0 +gfx950,256,16384,4608,7168,ck,13,3,574.1814,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1885.0,525.04,0.0 +gfx950,256,32768,4608,7168,ck,13,2,1114.2846,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1942.65,511.45,0.0 +gfx950,256,16,7168,2048,ck,7,3,7.2132,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,65.13,2071.51,0.0 +gfx950,256,16,7168,2304,ck,7,0,8.067,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,65.51,2080.24,0.0 +gfx950,256,32,7168,2048,ck,7,0,7.222,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,130.09,2105.28,0.0 +gfx950,256,32,7168,2304,ck,7,3,8.253,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,128.07,2065.62,0.0 +gfx950,256,48,7168,2048,ck,6,3,7.9865,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,176.46,1936.58,0.0 +gfx950,256,48,7168,2304,ck,7,0,9.4657,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,167.49,1829.11,0.0 +gfx950,256,64,7168,2048,ck,6,1,8.335,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,225.44,1887.06,0.0 +gfx950,256,64,7168,2304,ck,12,0,9.5781,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,220.7,1835.44,0.0 +gfx950,256,80,7168,2048,ck,11,1,9.4105,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,249.59,1699.25,0.0 +gfx950,256,80,7168,2304,ck,17,2,9.9321,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.05,1796.83,0.0 +gfx950,256,96,7168,2048,ck,11,2,9.625,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,292.84,1688.62,0.0 +gfx950,256,96,7168,2304,ck,17,1,9.9563,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,318.48,1819.2,0.0 +gfx950,256,112,7168,2048,ck,11,1,9.4953,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,346.31,1739.29,0.0 +gfx950,256,112,7168,2304,ck,17,0,10.0093,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,369.59,1836.17,0.0 +gfx950,256,128,7168,2048,ck,11,0,9.471,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,396.8,1771.43,0.0 +gfx950,256,128,7168,2304,ck,17,3,10.0901,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,419.01,1847.85,0.0 +gfx950,256,144,7168,2048,asm,8,1,11.1728,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,378.41,1525.08,0.0 +gfx950,256,144,7168,2304,asm,8,1,12.1844,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,390.36,1552.09,0.0 +gfx950,256,160,7168,2048,asm,8,1,11.2214,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,418.63,1541.83,0.0 +gfx950,256,160,7168,2304,asm,8,1,12.1653,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,434.42,1576.41,0.0 +gfx950,256,176,7168,2048,asm,8,1,11.5129,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,448.83,1525.56,0.0 +gfx950,256,176,7168,2304,asm,8,1,12.5851,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,461.92,1544.98,0.0 +gfx950,256,192,7168,2048,asm,8,1,11.6453,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,484.07,1530.73,0.0 +gfx950,256,192,7168,2304,asm,8,1,12.6184,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,502.58,1562.0,0.0 +gfx950,256,208,7168,2048,asm,16,1,12.0625,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,506.27,1499.52,0.0 +gfx950,256,208,7168,2304,asm,16,1,13.1559,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,522.22,1518.42,0.0 +gfx950,256,224,7168,2048,ck,17,0,12.3478,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,532.62,1486.1,0.0 +gfx950,256,224,7168,2304,asm,16,1,13.3633,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,553.66,1514.78,0.0 +gfx950,256,240,7168,2048,asm,16,1,12.4016,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,568.19,1500.79,0.0 +gfx950,256,240,7168,2304,asm,16,1,13.2042,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,600.36,1553.19,0.0 +gfx950,256,256,7168,2048,asm,16,1,12.2335,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,614.39,1542.84,0.0 +gfx950,256,256,7168,2304,asm,16,1,13.6382,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,620.0,1523.29,0.0 +gfx950,256,288,7168,2048,asm,24,1,13.7055,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,616.96,1415.39,0.0 +gfx950,256,288,7168,2304,asm,24,1,14.4966,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,656.2,1469.82,0.0 +gfx950,256,320,7168,2048,asm,24,1,13.4623,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,697.89,1479.91,0.0 +gfx950,256,320,7168,2304,asm,24,1,14.5582,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,726.03,1500.18,0.0 +gfx950,256,352,7168,2048,asm,32,1,14.96,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,690.83,1366.79,0.0 +gfx950,256,352,7168,2304,asm,32,1,15.663,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,742.3,1428.36,0.0 +gfx950,256,384,7168,2048,asm,32,1,14.6886,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,767.55,1427.74,0.0 +gfx950,256,384,7168,2304,asm,32,1,16.4306,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,771.95,1394.04,0.0 +gfx950,256,416,7168,2048,ck,14,3,16.8019,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,726.93,1279.37,0.0001 +gfx950,256,416,7168,2304,ck,14,0,18.9316,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,725.8,1238.0,0.0 +gfx950,256,448,7168,2048,ck,14,0,16.6168,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,791.57,1325.17,0.0001 +gfx950,256,448,7168,2304,ck,14,2,18.9489,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,780.92,1264.97,0.0 +gfx950,256,480,7168,2048,ck,14,2,17.1416,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,822.14,1315.19,0.0001 +gfx950,256,480,7168,2304,asm,40,1,19.1807,_ZN5aiter43fp8gemm_bf16_blockscale_BpreShuffle_128x128E,826.58,1277.44,0.0 +gfx950,256,512,7168,2048,ck,14,0,17.088,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,879.7,1349.99,0.0002 +gfx950,256,512,7168,2304,ck,14,0,19.4496,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,869.5,1287.16,0.0 +gfx950,256,544,7168,2048,ck,14,1,17.8708,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,893.74,1320.2,0.0001 +gfx950,256,544,7168,2304,ck,14,1,20.1182,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,893.14,1270.85,0.0 +gfx950,256,576,7168,2048,ck,14,3,17.8731,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,946.19,1349.36,0.0002 +gfx950,256,576,7168,2304,ck,14,2,20.0657,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,948.15,1300.71,0.0 +gfx950,256,608,7168,2048,ck,14,0,21.9319,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,813.93,1123.55,0.0001 +gfx950,256,608,7168,2304,ck,14,0,24.7891,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,810.13,1074.35,0.0001 +gfx950,256,640,7168,2048,ck,14,3,21.8952,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,858.2,1149.38,0.0002 +gfx950,256,640,7168,2304,ck,14,1,24.582,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,859.95,1105.06,0.0001 +gfx950,256,672,7168,2048,ck,14,0,22.3856,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,881.37,1147.62,0.0001 +gfx950,256,672,7168,2304,ck,14,0,25.3952,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,874.03,1090.65,0.0001 +gfx950,256,704,7168,2048,ck,14,0,22.1673,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,932.43,1182.57,0.0001 +gfx950,256,704,7168,2304,ck,14,2,24.9362,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,932.51,1132.07,0.0001 +gfx950,256,736,7168,2048,ck,14,3,22.8087,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,947.4,1172.3,0.0002 +gfx950,256,736,7168,2304,ck,14,3,25.8221,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,941.45,1113.86,0.0001 +gfx950,256,768,7168,2048,ck,14,1,22.8406,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,987.21,1193.62,0.0002 +gfx950,256,768,7168,2304,ck,14,0,25.4961,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,994.94,1148.98,0.0002 +gfx950,256,800,7168,2048,ck,14,0,23.6361,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,993.74,1175.63,0.0002 +gfx950,256,800,7168,2304,ck,14,1,26.6837,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,990.27,1117.8,0.0001 +gfx950,256,832,7168,2048,ck,14,0,23.2242,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1051.82,1219.05,0.0003 +gfx950,256,832,7168,2304,ck,14,1,26.6718,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1030.34,1138.26,0.0002 +gfx950,256,864,7168,2048,ck,13,0,25.6049,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,990.71,1126.18,0.0 +gfx950,256,864,7168,2304,ck,13,3,28.3077,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1008.14,1091.29,0.0 +gfx950,256,896,7168,2048,ck,13,0,25.6653,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1024.99,1143.96,0.0 +gfx950,256,896,7168,2304,ck,13,0,27.9433,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1059.11,1124.58,0.0 +gfx950,256,928,7168,2048,ck,13,3,26.5242,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1027.22,1126.68,0.0 +gfx950,256,928,7168,2304,ck,13,0,29.6559,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1033.59,1077.59,0.0 +gfx950,256,960,7168,2048,ck,13,1,26.7867,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1052.23,1135.22,0.0 +gfx950,256,960,7168,2304,ck,13,1,29.2621,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1083.62,1110.29,0.0 +gfx950,256,992,7168,2048,ck,13,2,27.1511,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1072.71,1139.29,0.0 +gfx950,256,992,7168,2304,ck,13,0,30.3926,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1078.09,1086.51,0.0 +gfx950,256,1024,7168,2048,ck,13,2,26.9597,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1115.17,1166.83,0.0 +gfx950,256,1024,7168,2304,ck,13,2,30.2979,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1116.34,1107.48,0.0 +gfx950,256,1088,7168,2048,ck,13,2,27.2805,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1170.94,1191.54,0.0 +gfx950,256,1088,7168,2304,ck,13,3,30.6045,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1174.23,1131.19,0.0 +gfx950,256,1152,7168,2048,ck,13,1,27.4994,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1229.95,1220.19,0.0 +gfx950,256,1152,7168,2304,ck,13,1,30.6421,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1241.78,1164.55,0.0 +gfx950,256,1216,7168,2048,ck,14,1,33.3229,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1071.39,1038.42,0.0003 +gfx950,256,1216,7168,2304,ck,14,1,38.9553,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1031.04,943.37,0.0002 +gfx950,256,1280,7168,2048,ck,14,1,33.8453,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1110.37,1053.37,0.0003 +gfx950,256,1280,7168,2304,ck,14,1,38.5149,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1097.72,981.81,0.0003 +gfx950,256,1344,7168,2048,ck,14,1,35.0697,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1125.19,1046.49,0.0003 +gfx950,256,1344,7168,2304,ck,14,1,39.8326,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1114.48,976.07,0.0002 +gfx950,256,1408,7168,2048,ck,14,1,34.9055,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1184.31,1081.46,0.0003 +gfx950,256,1408,7168,2304,ck,14,0,40.0446,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1161.37,997.49,0.0002 +gfx950,256,1472,7168,2048,ck,13,2,37.8797,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1140.93,1024.22,0.0 +gfx950,256,1472,7168,2304,ck,13,0,41.5178,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1171.07,987.75,0.0 +gfx950,256,1536,7168,2048,ck,13,1,37.7413,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1194.9,1055.76,0.0 +gfx950,256,1536,7168,2304,ck,13,3,41.4718,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1223.34,1014.52,0.0 +gfx950,256,1600,7168,2048,ck,13,0,38.8337,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1209.68,1053.07,0.0 +gfx950,256,1600,7168,2304,ck,13,2,42.9042,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1231.77,1005.47,0.0 +gfx950,256,1664,7168,2048,ck,13,1,39.2573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1244.49,1068.41,0.0 +gfx950,256,1664,7168,2304,ck,13,2,43.621,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1259.99,1013.37,0.0 +gfx950,256,1728,7168,2048,ck,13,0,39.8069,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1274.51,1080.0,0.0 +gfx950,256,1728,7168,2304,ck,13,3,44.4322,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1284.57,1018.83,0.0 +gfx950,256,1792,7168,2048,ck,14,2,45.809,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1148.54,961.39,0.0003 +gfx950,256,1792,7168,2304,ck,13,2,49.8498,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1187.37,929.47,0.0 +gfx950,256,1856,7168,2048,ck,14,2,46.3162,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1176.53,973.5,0.0003 +gfx950,256,1856,7168,2304,ck,14,1,52.4771,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1168.2,903.23,0.0003 +gfx950,256,1920,7168,2048,ck,13,2,46.9258,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1201.29,983.2,0.0 +gfx950,256,1920,7168,2304,ck,13,1,52.4563,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1208.97,923.89,0.0 +gfx950,256,1984,7168,2048,ck,13,0,48.5366,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1200.14,972.17,0.0 +gfx950,256,1984,7168,2304,ck,13,3,54.0152,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1213.21,916.94,0.0 +gfx950,256,2048,7168,2048,ck,13,1,49.7429,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1208.81,969.68,0.0 +gfx950,256,2048,7168,2304,ck,13,1,54.9991,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1229.94,919.9,0.0 +gfx950,256,4096,7168,2048,ck,13,1,84.9609,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1415.46,962.67,0.0 +gfx950,256,4096,7168,2304,ck,13,0,94.7138,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1428.42,893.98,0.0 +gfx950,256,8192,7168,2048,ck,13,0,157.5272,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1526.84,945.22,0.0 +gfx950,256,8192,7168,2304,ck,13,3,171.0665,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1581.74,893.39,0.0 +gfx950,256,16384,7168,2048,ck,13,1,291.9559,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1647.63,969.72,0.0 +gfx950,256,16384,7168,2304,ck,13,1,324.2426,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1669.02,891.75,0.0 +gfx950,256,32768,7168,2048,ck,13,1,556.7794,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1727.92,990.61,0.0 +gfx950,256,32768,7168,2304,ck,13,0,619.9048,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1745.96,906.23,0.0 +gfx950,256,16,8192,1536,ck,7,2,6.4719,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.22,1988.54,0.0 +gfx950,256,32,8192,1536,ck,7,2,6.2694,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,128.45,2098.5,0.0 +gfx950,256,48,8192,1536,ck,6,0,6.7969,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,177.72,1977.82,0.0 +gfx950,256,64,8192,1536,ck,12,1,6.8746,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.28,1997.18,0.0 +gfx950,256,80,8192,1536,ck,12,0,8.1128,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.16,1727.7,0.0 +gfx950,256,96,8192,1536,ck,11,3,7.6815,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,314.51,1862.04,0.0 +gfx950,256,112,8192,1536,ck,12,1,8.0886,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,348.46,1803.77,0.0 +gfx950,256,128,8192,1536,ck,11,0,8.3009,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,388.06,1792.18,0.0 +gfx950,256,144,8192,1536,asm,16,1,10.0466,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,360.71,1509.31,0.0 +gfx950,256,160,8192,1536,asm,8,1,9.5079,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,423.49,1624.98,0.0 +gfx950,256,176,8192,1536,asm,8,1,10.041,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,441.11,1567.26,0.0 +gfx950,256,192,8192,1536,asm,8,1,9.735,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,496.34,1645.97,0.0 +gfx950,256,208,8192,1536,ck,17,1,10.774,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,485.84,1513.85,0.0 +gfx950,256,224,8192,1536,asm,16,1,10.472,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,538.31,1584.89,0.0 +gfx950,256,240,8192,1536,ck,17,1,11.016,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,548.28,1532.65,0.0 +gfx950,256,256,8192,1536,asm,16,1,10.6076,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_64x128E,607.34,1618.69,0.0 +gfx950,256,288,8192,1536,asm,24,1,11.5847,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,625.63,1531.66,0.0 +gfx950,256,320,8192,1536,asm,24,1,11.8865,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_80x128E,677.5,1541.02,0.0 +gfx950,256,352,8192,1536,asm,32,1,13.3519,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,663.45,1414.84,0.0 +gfx950,256,384,8192,1536,asm,32,1,13.0874,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,738.4,1487.25,0.0 +gfx950,256,416,8192,1536,ck,14,0,15.0583,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,695.23,1330.67,0.0 +gfx950,256,448,8192,1536,ck,14,2,14.9824,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,752.5,1375.69,0.0 +gfx950,256,480,8192,1536,ck,14,3,15.2947,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,789.79,1385.09,0.0 +gfx950,256,512,8192,1536,ck,14,3,15.175,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,849.09,1433.8,0.0 +gfx950,256,544,8192,1536,ck,14,0,19.3452,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,707.68,1154.36,0.0001 +gfx950,256,576,8192,1536,ck,14,3,19.3504,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,749.11,1183.69,0.0001 +gfx950,256,608,8192,1536,ck,14,1,19.5196,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,783.87,1202.8,0.0001 +gfx950,256,640,8192,1536,ck,14,0,19.295,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,834.73,1246.53,0.0001 +gfx950,256,672,8192,1536,ck,14,2,19.9677,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,846.94,1233.25,0.0002 +gfx950,256,704,8192,1536,ck,14,0,19.791,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,895.19,1273.23,0.0001 +gfx950,256,736,8192,1536,ck,14,2,20.4392,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,906.2,1260.91,0.0002 +gfx950,256,768,8192,1536,ck,14,0,20.2904,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,952.54,1298.42,0.0002 +gfx950,256,800,8192,1536,ck,13,2,22.2979,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,902.89,1207.24,0.0 +gfx950,256,832,8192,1536,ck,13,0,22.2371,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,941.58,1236.33,0.0 +gfx950,256,864,8192,1536,ck,13,3,23.0882,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,941.75,1215.59,0.0 +gfx950,256,896,8192,1536,ck,13,3,22.8541,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,986.63,1253.13,0.0 +gfx950,256,928,8192,1536,ck,13,0,23.3637,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,999.58,1250.34,0.0 +gfx950,256,960,8192,1536,ck,13,0,23.2165,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1040.6,1282.97,0.0 +gfx950,256,992,8192,1536,ck,13,3,24.0286,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1038.95,1263.48,0.0 +gfx950,256,1024,8192,1536,ck,13,0,23.849,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1080.54,1297.04,0.0 +gfx950,256,1088,8192,1536,ck,14,1,28.527,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,959.81,1124.54,0.0001 +gfx950,256,1152,8192,1536,ck,14,2,28.3804,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1021.52,1170.76,0.0002 +gfx950,256,1216,8192,1536,ck,14,2,28.9229,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1058.04,1188.46,0.0002 +gfx950,256,1280,8192,1536,ck,14,1,30.0017,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1073.68,1183.95,0.0002 +gfx950,256,1344,8192,1536,ck,13,2,31.5777,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1071.1,1161.18,0.0 +gfx950,256,1408,8192,1536,ck,13,0,31.753,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1115.91,1190.89,0.0 +gfx950,256,1472,8192,1536,ck,13,1,32.3237,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1146.04,1205.34,0.0 +gfx950,256,1536,8192,1536,ck,13,3,32.6491,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1183.94,1228.46,0.0 +gfx950,256,1600,8192,1536,ck,14,3,38.1646,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1055.04,1080.97,0.0002 +gfx950,256,1664,8192,1536,ck,14,3,38.4425,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1089.31,1102.99,0.0002 +gfx950,256,1728,8192,1536,ck,14,3,38.6445,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1125.3,1126.9,0.0002 +gfx950,256,1792,8192,1536,ck,14,1,39.7512,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1134.49,1124.38,0.0002 +gfx950,256,1856,8192,1536,ck,13,2,41.175,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1134.37,1113.36,0.0 +gfx950,256,1920,8192,1536,ck,13,2,41.7573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1157.12,1125.3,0.0 +gfx950,256,1984,8192,1536,ck,13,2,42.9929,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1161.33,1119.63,0.0 +gfx950,256,2048,8192,1536,ck,13,3,44.712,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1152.7,1102.23,0.0 +gfx950,256,4096,8192,1536,ck,13,1,79.2213,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1301.16,1085.35,0.0 +gfx950,256,8192,8192,1536,ck,13,2,146.3103,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1409.05,1089.35,0.0 +gfx950,256,16384,8192,1536,ck,13,3,271.6202,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1517.99,1127.25,0.0 +gfx950,256,32768,8192,1536,ck,13,1,513.6065,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1605.57,1167.79,0.0 +gfx950,256,16,11264,1536,ck,7,1,6.7619,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,81.88,2615.62,0.0 +gfx950,256,32,11264,1536,ck,6,1,7.096,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,156.05,2546.72,0.0 +gfx950,256,48,11264,1536,ck,7,0,7.5363,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,220.39,2449.02,0.0 +gfx950,256,64,11264,1536,ck,17,2,8.3027,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.73,2269.33,0.0 +gfx950,256,80,11264,1536,asm,8,1,9.794,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,282.65,1963.1,0.0 +gfx950,256,96,11264,1536,asm,8,1,9.9361,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_48x128E,334.33,1973.78,0.0 +gfx950,256,112,11264,1536,ck,12,3,10.4055,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,372.45,1921.74,0.0 +gfx950,256,128,11264,1536,ck,10,2,10.6475,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,415.98,1914.22,0.0227 +gfx950,256,144,11264,1536,ck,9,2,11.863,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,420.03,1750.55,0.0 +gfx950,256,160,11264,1536,ck,9,1,11.9836,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,462.0,1765.06,0.0 +gfx950,256,176,11264,1536,ck,15,2,12.9368,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,470.76,1664.77,0.0091 +gfx950,256,192,11264,1536,asm,32,1,12.7768,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_96x128E,519.99,1715.75,0.0 +gfx950,256,208,11264,1536,ck,15,3,13.6827,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,526.02,1630.29,0.0112 +gfx950,256,224,11264,1536,ck,15,1,13.86,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,559.24,1637.22,0.0122 +gfx950,256,240,11264,1536,ck,15,1,14.1955,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,585.02,1625.65,0.0107 +gfx950,256,256,11264,1536,ck,15,3,14.3361,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,617.91,1636.56,0.0013 +gfx950,256,288,11264,1536,ck,14,3,15.2132,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,655.07,1592.82,0.0 +gfx950,256,320,11264,1536,ck,14,0,15.3302,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,722.3,1630.9,0.0 +gfx950,256,352,11264,1536,ck,8,2,17.669,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,689.36,1458.6,0.0 +gfx950,256,384,11264,1536,ck,14,2,18.7495,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,708.69,1415.62,0.0 +gfx950,256,416,11264,1536,ck,14,0,19.4889,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,738.62,1401.42,0.0001 +gfx950,256,448,11264,1536,ck,14,2,19.4734,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,796.07,1442.08,0.0001 +gfx950,256,480,11264,1536,ck,14,1,19.9547,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,832.36,1445.89,0.0001 +gfx950,256,512,11264,1536,ck,14,1,19.8498,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,892.54,1492.32,0.0001 +gfx950,256,544,11264,1536,ck,13,0,22.6047,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,832.75,1344.51,0.0 +gfx950,256,576,11264,1536,ck,13,2,22.3751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,890.78,1392.73,0.0 +gfx950,256,608,11264,1536,ck,13,0,23.0538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,912.59,1385.13,0.0 +gfx950,256,640,11264,1536,ck,13,2,23.2775,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,951.39,1404.9,0.0 +gfx950,256,672,11264,1536,ck,13,2,23.5796,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,986.16,1419.55,0.0 +gfx950,256,704,11264,1536,ck,13,1,23.4414,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1039.21,1460.77,0.0 +gfx950,256,736,11264,1536,ck,14,3,28.0696,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,907.31,1247.35,0.0002 +gfx950,256,768,11264,1536,ck,14,2,27.9392,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,951.18,1280.73,0.0002 +gfx950,256,800,11264,1536,ck,14,0,28.7283,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,963.59,1272.36,0.0002 +gfx950,256,832,11264,1536,ck,14,3,29.2751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,983.42,1274.9,0.0002 +gfx950,256,864,11264,1536,ck,14,0,29.5972,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1010.13,1287.04,0.0002 +gfx950,256,896,11264,1536,ck,14,1,29.5197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1050.29,1316.51,0.0002 +gfx950,256,928,11264,1536,ck,13,1,31.8538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1008.09,1244.21,0.0 +gfx950,256,960,11264,1536,ck,13,3,31.3254,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1060.45,1289.78,0.0 +gfx950,256,992,11264,1536,ck,13,0,35.4472,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,968.38,1161.53,0.0 +gfx950,256,1024,11264,1536,ck,13,0,35.0248,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1011.67,1197.52,0.0 +gfx950,256,1088,11264,1536,ck,13,0,33.7656,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1114.98,1287.79,0.0 +gfx950,256,1152,11264,1536,ck,14,2,36.3374,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1097.01,1239.03,0.0002 +gfx950,256,1216,11264,1536,ck,14,0,38.9925,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1079.11,1194.16,0.0003 +gfx950,256,1280,11264,1536,ck,13,1,39.6937,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1115.84,1211.87,0.0 +gfx950,256,1344,11264,1536,ck,13,1,42.2937,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1099.61,1173.78,0.0 +gfx950,256,1408,11264,1536,ck,13,3,41.3354,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1178.68,1238.25,0.0 +gfx950,256,1472,11264,1536,ck,13,3,41.1898,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1236.61,1280.02,0.0 +gfx950,256,1536,11264,1536,ck,13,3,46.8067,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1135.53,1159.32,0.0 +gfx950,256,1600,11264,1536,ck,13,2,48.4414,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1142.92,1151.99,0.0 +gfx950,256,1664,11264,1536,ck,13,2,48.8634,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1178.37,1173.56,0.0 +gfx950,256,1728,11264,1536,ck,13,3,48.7658,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1226.15,1207.49,0.0 +gfx950,256,1792,11264,1536,ck,13,0,49.6094,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1249.94,1218.0,0.0 +gfx950,256,1856,11264,1536,ck,13,1,52.3927,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1225.8,1182.69,0.0 +gfx950,256,1920,11264,1536,ck,13,3,58.0171,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1145.14,1094.58,0.0 +gfx950,256,1984,11264,1536,ck,13,1,58.9391,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1164.8,1103.59,0.0 +gfx950,256,2048,11264,1536,ck,13,1,61.3891,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1154.39,1084.63,0.0 +gfx950,256,4096,11264,1536,ck,13,1,104.1395,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1361.0,1112.62,0.0 +gfx950,256,8192,11264,1536,ck,13,1,191.8263,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1477.73,1117.85,0.0 +gfx950,256,16384,11264,1536,ck,13,1,360.1565,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1574.14,1142.74,0.0 +gfx950,256,32768,11264,1536,ck,13,2,679.5554,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1668.55,1185.82,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3.5_397b.csv b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3.5_397b.csv index 3cd811ff18..649677188d 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3.5_397b.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3.5_397b.csv @@ -1,554 +1,554 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,16,4096,cktile,12,0,9.9757,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.01,6.98,0.0 -256,2,16,4096,cktile,12,0,13.9708,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.02,5.28,0.0 -256,4,16,4096,cktile,0,0,14.2842,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,0.04,5.74,0.0 -256,8,16,4096,cktile,2,0,14.5794,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.07,6.76,0.0 -256,16,16,4096,cktile,12,0,15.0983,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.14,8.72,0.0 -256,32,16,4096,cktile,2,0,15.196,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.28,13.01,0.0 -256,64,16,4096,cktile,2,0,15.3293,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.55,21.51,0.0 -256,128,16,4096,cktile,2,0,15.5018,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,1.08,38.31,0.0 -256,256,16,4096,cktile,0,0,15.5628,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,2.16,72.11,0.0 -256,512,16,4096,cktile,14,0,16.0332,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_3,4.19,135.91,0.0 -256,1024,16,4096,cktile,2,0,17.2261,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,7.79,249.19,0.0 -256,2048,16,4096,cktile,2,0,17.6907,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,15.17,481.59,0.0 -256,4096,16,4096,cktile,2,0,17.9746,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,29.87,944.32,0.0 -256,8192,16,4096,cktile,1,0,25.2013,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,42.61,1344.46,0.0 -256,16384,16,4096,cktile,7,0,37.8328,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,56.76,1789.42,0.0 -256,32768,16,4096,cktile,19,0,55.9249,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,76.8,2419.89,0.0 -256,1,32,4096,cktile,12,0,9.9923,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.03,13.53,0.0 -256,2,32,4096,cktile,1,0,13.9482,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.04,9.99,0.0 -256,4,32,4096,cktile,2,0,14.2116,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.07,10.39,0.0 -256,8,32,4096,cktile,1,0,14.576,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.14,11.28,0.0 -256,16,32,4096,cktile,1,0,15.1038,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.28,13.08,0.0 -256,32,32,4096,cktile,12,0,15.3304,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.55,17.23,0.0 -256,64,32,4096,cktile,1,0,15.251,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,1.1,26.05,0.0 -256,128,32,4096,cktile,12,0,15.6217,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,2.15,42.48,0.0 -256,256,32,4096,cktile,0,0,15.6541,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,4.29,76.4,0.0 -256,512,32,4096,cktile,12,0,16.3069,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,8.23,138.65,0.0 -256,1024,32,4096,cktile,2,0,17.4819,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,15.36,251.17,0.0 -256,2048,32,4096,cktile,1,0,17.7872,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,30.18,486.35,0.0 -256,4096,32,4096,cktile,0,0,18.2014,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,58.99,943.36,0.0 -256,8192,32,4096,cktile,2,0,25.1674,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,85.33,1359.29,0.0 -256,16384,32,4096,cktile,7,0,38.4387,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,111.74,1776.56,0.0 -256,32768,32,4096,cktile,18,0,56.1714,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,152.92,2429.1,0.0 -256,1,256,512,ck,7,0,2.6477,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.1,49.89,0.0 -256,2,256,512,ck,7,0,2.6554,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.2,50.13,0.0 -256,4,256,512,ck,7,0,2.4076,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.44,56.14,0.0 -256,8,256,512,ck,7,0,2.6846,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.78,51.88,0.0 -256,16,256,512,ck,7,0,2.7194,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.54,54.22,0.0 -256,32,256,512,ck,12,0,2.9842,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,2.81,54.9,0.0 -256,64,256,512,ck,6,0,3.6491,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,4.6,53.88,0.0 -256,128,256,512,ck,7,0,3.2278,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.4,81.21,0.0 -256,256,256,512,ck,7,0,3.2341,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.75,121.58,0.0 -256,512,256,512,ck,7,0,3.2978,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.7,198.73,0.0 -256,1024,256,512,ck,12,0,3.9174,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,68.52,301.13,0.0 -256,2048,256,512,ck,6,0,3.9969,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,134.32,557.49,0.0 -256,4096,256,512,ck,12,0,5.1797,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,207.3,835.06,0.0 -256,8192,256,512,ck,9,0,6.6634,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,322.28,1278.58,0.0 -256,16384,256,512,ck,1,0,10.5907,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,405.54,1596.52,0.0 -256,32768,256,512,ck,16,0,16.1398,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,532.22,2087.11,0.0 -256,1,512,256,ck,5,0,2.7069,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.1,48.89,0.0 -256,1,512,512,ck,7,0,2.9061,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.18,90.73,0.0 -256,1,512,1024,ck,6,0,4.124,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.25,127.63,0.0 -256,1,512,2048,ck,6,0,5.2158,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.4,201.63,0.0 -256,2,512,256,ck,6,0,2.5406,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.21,52.6,0.0 -256,2,512,512,ck,6,0,2.7919,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.38,94.99,0.0 -256,2,512,1024,ck,6,0,3.8596,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.54,136.9,0.0 -256,2,512,2048,ck,6,0,5.4904,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.76,192.1,0.0 -256,4,512,256,ck,4,0,2.6362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,0.4,51.66,0.0 -256,4,512,512,ck,6,0,3.0436,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.69,88.15,0.0 -256,4,512,1024,ck,7,0,3.8855,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.08,137.04,0.0 -256,4,512,2048,ck,7,0,5.2943,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.58,200.38,0.0 -256,8,512,256,ck,6,0,2.5849,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.81,54.67,0.0 -256,8,512,512,ck,6,0,2.8378,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,1.48,96.71,0.0 -256,8,512,1024,ck,6,0,4.1566,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,2.02,130.08,0.0 -256,8,512,2048,ck,6,0,5.404,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,3.1,198.58,0.0 -256,16,512,256,ck,6,0,2.6211,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,1.6,57.82,0.0 -256,16,512,512,ck,6,0,2.8905,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,2.9,99.19,0.0 -256,16,512,1024,ck,7,0,3.9659,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.23,140.46,0.0 -256,16,512,2048,ck,7,0,5.2571,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.38,208.81,0.0 -256,32,512,256,ck,7,0,2.9498,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.84,58.32,0.0 -256,32,512,512,ck,7,0,3.2402,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.18,96.07,0.0 -256,32,512,1024,ck,7,0,4.1647,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.06,141.62,0.0 -256,32,512,2048,ck,7,0,5.4239,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.37,211.45,0.0 -256,64,512,256,ck,7,0,2.9421,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.7,72.39,0.0 -256,64,512,512,ck,7,0,3.2447,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.34,111.09,0.0 -256,64,512,1024,ck,12,0,4.617,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,14.54,141.94,0.0 -256,64,512,2048,ck,12,0,6.0064,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,22.35,207.31,0.0 -256,128,512,256,ck,10,0,3.388,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,9.9,87.05,0.0 -256,128,512,512,ck,7,0,3.2281,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.79,142.11,0.0 -256,128,512,1024,ck,6,0,4.7301,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,28.38,166.26,0.0 -256,128,512,2048,ck,7,0,5.4576,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.19,264.18,0.0 -256,256,512,256,ck,7,0,2.9734,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.57,154.29,0.0 -256,256,512,512,ck,7,0,3.2856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.85,199.46,0.0 -256,256,512,1024,ck,7,0,4.2998,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.43,243.87,0.0 -256,256,512,2048,ck,12,0,6.0343,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,88.97,304.1,0.0 -256,512,512,256,ck,12,0,3.3898,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,39.59,232.0,0.0 -256,512,512,512,ck,6,0,3.8003,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,70.64,275.92,0.0 -256,512,512,1024,ck,12,0,4.7538,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,112.94,330.86,0.0 -256,512,512,2048,ck,7,0,6.4199,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,167.25,408.33,0.0 -256,1024,512,256,ck,12,0,3.4621,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,77.54,416.45,0.0 -256,1024,512,512,ck,6,0,3.9298,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,136.62,466.95,0.0 -256,1024,512,1024,ck,12,0,5.4144,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,198.31,484.16,0.0 -256,1024,512,2048,ck,12,0,6.9139,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,310.6,606.65,0.0 -256,2048,512,256,ck,4,0,4.3267,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,124.08,636.17,0.0 -256,2048,512,512,ck,11,0,4.9635,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,216.33,686.59,0.0 -256,2048,512,1024,ck,9,0,7.2831,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,294.86,647.88,0.0 -256,2048,512,2048,ck,11,0,10.3233,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,416.05,711.02,0.0 -256,4096,512,256,ck,12,0,5.7231,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,187.62,938.99,0.0 -256,4096,512,512,ck,16,0,6.0313,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,356.06,1086.6,0.0 -256,4096,512,1024,ck,14,0,9.5983,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,447.47,928.59,0.0 -256,4096,512,2048,ck,15,0,13.5435,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,634.25,1006.5,0.0009 -256,8192,512,256,ck,14,0,7.7636,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,276.61,1367.51,0.0 -256,8192,512,512,ck,1,0,10.1629,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,422.61,1263.92,0.0 -256,8192,512,1024,ck,14,0,13.1835,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,651.57,1312.36,0.0 -256,8192,512,2048,ck,14,0,18.8774,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,910.08,1388.67,0.0 -256,16384,512,256,ck,9,0,12.9271,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,332.25,1632.43,0.0 -256,16384,512,512,ck,16,0,16.0509,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,535.17,1584.21,0.0 -256,16384,512,1024,ck,13,0,20.9235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,821.08,1628.73,0.0 -256,16384,512,2048,ck,13,0,29.9784,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1146.15,1713.91,0.0 -256,32768,512,256,ck,16,0,21.12,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,406.72,1992.15,0.0 -256,32768,512,512,ck,16,0,27.1113,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,633.68,1866.15,0.0 -256,32768,512,1024,ck,14,0,38.8109,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,885.31,1742.63,0.0004 -256,32768,512,2048,ck,13,0,56.8603,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1208.57,1788.8,0.0 -256,1,1024,256,ck,5,0,2.7529,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.19,96.06,0.0 -256,1,1024,512,ck,7,0,2.9323,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.36,179.67,0.0 -256,1,1024,1024,cktile,1,0,4.3045,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.49,244.31,0.0 -256,1,1024,2048,ck,7,0,5.17,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.81,406.43,0.0 -256,1,1024,4096,ck,7,0,11.8308,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.71,355.04,0.0 -256,2,1024,256,ck,5,0,2.7842,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.38,95.81,0.0 -256,2,1024,512,ck,7,0,2.9541,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.71,179.21,0.0 -256,2,1024,1024,ck,7,0,3.8918,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.08,271.01,0.0 -256,2,1024,2048,ck,7,0,5.2282,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.6,402.69,0.0 -256,2,1024,4096,ck,7,0,12.0383,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.39,349.43,0.0 -256,4,1024,256,ck,5,0,2.7982,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.75,96.98,0.0 -256,4,1024,512,ck,7,0,3.5522,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.18,150.48,0.0 -256,4,1024,1024,ck,7,0,3.9714,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.11,267.13,0.0 -256,4,1024,2048,ck,7,0,5.2853,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.17,399.89,0.0 -256,4,1024,4096,ck,7,0,12.2115,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.75,345.48,0.0 -256,8,1024,256,ck,7,0,2.6714,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.57,105.03,0.0 -256,8,1024,512,ck,6,0,3.3901,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,2.47,160.69,0.0 -256,8,1024,1024,ck,6,0,4.5254,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,3.71,237.14,0.0 -256,8,1024,2048,ck,7,0,5.3151,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.31,400.73,0.0 -256,8,1024,4096,ck,7,0,11.9359,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.62,355.52,0.0 -256,16,1024,256,ck,7,0,2.7287,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.07,109.58,0.0 -256,16,1024,512,ck,7,0,3.0314,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.53,186.46,0.0 -256,16,1024,1024,ck,7,0,4.569,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.34,240.26,0.0 -256,16,1024,2048,ck,7,0,5.1776,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.96,417.7,0.0 -256,16,1024,4096,ck,7,0,11.4238,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.75,375.76,0.0 -256,32,1024,256,ck,7,0,3.0511,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.5,110.08,0.0 -256,32,1024,512,ck,7,0,3.2677,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.27,185.52,0.0 -256,32,1024,1024,ck,7,0,4.8239,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.91,237.75,0.0 -256,32,1024,2048,ck,7,0,5.3868,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.92,413.65,0.0 -256,32,1024,4096,ck,7,0,11.2193,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.93,391.37,0.0 -256,64,1024,256,ck,5,0,3.0742,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,10.91,133.24,0.0 -256,64,1024,512,ck,6,0,3.6706,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,18.28,187.47,0.0 -256,64,1024,1024,ck,7,0,4.2391,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,31.66,293.74,0.0 -256,64,1024,2048,ck,7,0,5.3999,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.71,436.91,0.0 -256,64,1024,4096,ck,7,0,11.0075,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.77,416.76,0.0 -256,128,1024,256,ck,7,0,2.9858,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.48,186.57,0.0 -256,128,1024,512,ck,7,0,3.3077,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.58,257.57,0.0 -256,128,1024,1024,ck,7,0,4.2871,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.61,336.31,0.0 -256,128,1024,2048,ck,7,0,5.8742,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,91.39,446.26,0.0 -256,128,1024,4096,asm,5,6,7.7633,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,138.31,641.57,0.0001 -256,256,1024,256,ck,7,0,3.0616,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.84,278.28,0.0 -256,256,1024,512,ck,7,0,3.4184,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.53,345.09,0.0 -256,256,1024,1024,ck,7,0,4.5507,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,117.98,403.24,0.0 -256,256,1024,2048,ck,7,0,6.8139,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,157.58,461.66,0.0 -256,256,1024,4096,ck,7,0,11.2167,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.45,514.16,0.0 -256,512,1024,256,ck,12,0,3.4576,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,77.64,416.99,0.0 -256,512,1024,512,ck,7,0,3.9003,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.65,470.48,0.0 -256,512,1024,1024,ck,12,0,5.1465,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,208.64,509.36,0.0 -256,512,1024,2048,ck,12,0,7.3305,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,292.95,572.17,0.0 -256,512,1024,4096,ck,12,0,11.9299,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,360.02,615.26,0.0 -256,1024,1024,256,ck,12,0,4.2083,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,127.57,622.92,0.0 -256,1024,1024,512,ck,11,0,4.9934,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,215.03,629.98,0.0 -256,1024,1024,1024,ck,17,0,6.6739,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,321.77,628.46,0.0 -256,1024,1024,2048,ck,17,0,10.2081,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,420.74,616.32,0.0 -256,1024,1024,4096,ck,17,0,14.2436,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,603.07,736.17,0.0 -256,2048,1024,256,ck,17,0,5.7494,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,186.76,866.31,0.0 -256,2048,1024,512,ck,14,0,6.599,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,325.43,873.95,0.0 -256,2048,1024,1024,ck,17,0,9.7776,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,439.27,750.7,0.0 -256,2048,1024,2048,ck,17,0,13.7534,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,624.57,762.41,0.0 -256,2048,1024,4096,ck,17,0,21.6011,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,795.32,776.68,0.0 -256,4096,1024,256,ck,8,0,7.9335,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,270.69,1222.58,0.0 -256,4096,1024,512,ck,16,0,10.3882,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,413.45,1059.86,0.0 -256,4096,1024,1024,ck,14,0,13.1991,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,650.8,1032.76,0.0 -256,4096,1024,2048,ck,14,0,19.096,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,899.66,988.39,0.0001 -256,4096,1024,4096,ck,14,0,30.6367,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1121.52,958.33,0.0002 -256,8192,1024,256,ck,9,0,12.8029,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,335.47,1494.7,0.0 -256,8192,1024,512,ck,11,0,15.7794,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,544.38,1362.27,0.0 -256,8192,1024,1024,ck,13,0,20.8729,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,823.07,1255.91,0.0 -256,8192,1024,2048,ck,13,0,29.8301,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1151.85,1195.15,0.0 -256,8192,1024,4096,ck,13,0,50.1691,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1369.76,1086.84,0.0 -256,16384,1024,256,ck,16,0,21.154,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,406.07,1796.86,0.0 -256,16384,1024,512,ck,16,0,25.8117,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,665.58,1645.27,0.0 -256,16384,1024,1024,ck,14,0,38.251,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,898.27,1343.24,0.0003 -256,16384,1024,2048,ck,13,0,55.8164,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1231.17,1239.89,0.0 -256,16384,1024,4096,ck,13,0,92.3826,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1487.71,1135.04,0.0 -256,32768,1024,256,ck,14,0,36.5227,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,470.39,2074.32,0.0 -256,32768,1024,512,ck,16,0,46.8431,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,733.51,1801.98,0.0 -256,32768,1024,1024,ck,13,0,70.7661,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,971.08,1437.3,0.0 -256,32768,1024,2048,ck,13,0,103.0712,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1333.44,1322.53,0.0 -256,32768,1024,4096,ck,13,0,172.4385,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1594.06,1191.85,0.0 -256,1,1152,2048,ck,6,0,5.7928,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.81,408.03,0.0 -256,2,1152,2048,ck,7,0,5.8106,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.62,407.53,0.0 -256,4,1152,2048,ck,7,0,5.8616,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.22,405.47,0.0 -256,8,1152,2048,ck,7,0,5.9408,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.35,402.99,0.0 -256,16,1152,2048,ck,7,0,5.5125,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.7,440.62,0.0 -256,32,1152,2048,ck,7,0,5.4689,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.61,456.87,0.0 -256,64,1152,2048,ck,7,0,6.1787,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.88,426.92,0.0 -256,128,1152,2048,ck,7,0,6.7713,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,89.2,430.69,0.0 -256,256,1152,2048,ck,7,0,6.8666,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,175.92,505.84,0.0 -256,512,1152,2048,ck,12,0,8.3546,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,289.17,549.1,0.0 -256,1024,1152,2048,ck,17,0,12.1623,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,397.28,560.4,0.0 -256,2048,1152,2048,ck,15,0,16.8691,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,572.86,668.22,0.0011 -256,4096,1152,2048,ck,14,0,23.5712,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,819.96,856.35,0.0002 -256,8192,1152,2048,ck,14,0,38.0618,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1015.58,998.66,0.0003 -256,16384,1152,2048,ck,14,0,67.0848,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1152.41,1098.05,0.0004 -256,32768,1152,2048,ck,14,0,122.1313,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1266.0,1186.97,0.0004 -256,1,1536,2048,ck,6,0,7.6446,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.82,412.17,0.0 -256,2,1536,2048,ck,7,0,7.5211,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.67,419.62,0.0 -256,4,1536,2048,ck,7,0,7.258,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.47,436.24,0.0 -256,8,1536,2048,ck,7,0,7.2516,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.94,439.45,0.0 -256,16,1536,2048,ck,7,0,7.5651,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.31,426.65,0.0 -256,32,1536,2048,ck,12,0,7.6017,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,26.48,435.37,0.0 -256,64,1536,2048,ck,7,0,7.3449,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.82,472.9,0.0 -256,128,1536,2048,ck,7,0,7.0358,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.46,540.25,0.0 -256,256,1536,2048,ck,12,0,6.8003,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,236.84,655.33,0.0 -256,512,1536,2048,ck,12,0,8.9195,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,361.14,646.58,0.0 -256,1024,1536,2048,ck,17,0,12.7918,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,503.64,655.78,0.0 -256,2048,1536,2048,ck,14,0,18.0163,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,715.18,756.62,0.0001 -256,4096,1536,2048,ck,14,0,25.2772,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1019.49,954.11,0.0003 -256,8192,1536,2048,ck,13,0,44.5696,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1156.38,1011.65,0.0 -256,16384,1536,2048,ck,13,0,78.1257,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1319.4,1114.0,0.0 -256,32768,1536,2048,ck,13,0,141.9473,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1452.36,1204.09,0.0 -256,1,2048,512,cktile,15,0,3.5207,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_4,0.6,299.14,0.0 -256,1,2048,1024,ck,7,0,3.8919,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.08,540.17,0.0 -256,1,2048,2048,ck,7,0,7.1436,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.17,588.0,0.0 -256,1,2048,4096,ck,7,0,12.1487,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.38,691.17,0.0 -256,2,2048,512,ck,7,0,2.9658,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.41,356.66,0.0 -256,2,2048,1024,ck,7,0,3.9292,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.13,536.34,0.0 -256,2,2048,2048,ck,7,0,7.3283,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.29,574.02,0.0 -256,2,2048,4096,ck,7,0,12.2679,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.74,685.12,0.0 -256,4,2048,512,ck,7,0,2.9866,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.81,357.27,0.0 -256,4,2048,1024,ck,6,0,4.4419,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,3.78,476.74,0.0 -256,4,2048,2048,ck,7,0,7.2284,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.64,583.65,0.0 -256,4,2048,4096,ck,7,0,12.1475,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.52,693.26,0.0 -256,8,2048,512,ck,7,0,3.0406,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.52,356.98,0.0 -256,8,2048,1024,ck,7,0,3.964,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.46,539.38,0.0 -256,8,2048,2048,ck,7,0,7.2237,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.29,587.44,0.0 -256,8,2048,4096,ck,7,0,12.0807,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.11,699.81,0.0 -256,16,2048,512,ck,7,0,3.0452,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.02,368.55,0.0 -256,16,2048,1024,ck,7,0,3.9842,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.84,546.93,0.0 -256,16,2048,2048,ck,7,0,6.856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.58,626.11,0.0 -256,16,2048,4096,ck,7,0,11.2919,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.77,754.49,0.0 -256,32,2048,512,ck,7,0,3.2685,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.53,365.93,0.0 -256,32,2048,1024,ck,7,0,4.2361,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,31.68,533.74,0.0 -256,32,2048,2048,ck,7,0,7.1317,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,37.64,615.69,0.0 -256,32,2048,4096,ck,7,0,11.4391,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.93,756.24,0.0 -256,64,2048,512,ck,7,0,3.3002,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.67,407.09,0.0 -256,64,2048,1024,ck,7,0,4.3176,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.17,561.62,0.0 -256,64,2048,2048,ck,7,0,7.1316,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,75.28,643.27,0.0 -256,64,2048,4096,ck,7,0,11.2775,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,95.21,790.33,0.0 -256,128,2048,512,ck,7,0,3.3974,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,79.01,482.25,0.0 -256,128,2048,1024,ck,7,0,5.065,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,106.0,543.44,0.0 -256,128,2048,2048,ck,7,0,7.0523,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,152.25,706.26,0.0 -256,128,2048,4096,ck,7,0,11.2291,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.24,840.42,0.0 -256,256,2048,512,ck,12,0,3.9772,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,134.99,560.25,0.0 -256,256,2048,1024,ck,12,0,5.3818,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,199.51,633.22,0.0 -256,256,2048,2048,ck,12,0,7.5893,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,282.96,759.91,0.0 -256,256,2048,4096,ck,12,0,12.1126,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,354.59,865.69,0.0 -256,512,2048,512,ck,11,0,5.0404,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,213.03,676.11,0.0 -256,512,2048,1024,ck,11,0,6.3963,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,335.74,737.71,0.0 -256,512,2048,2048,ck,17,0,9.7291,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,441.46,754.44,0.0 -256,512,2048,4096,ck,17,0,13.9323,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,616.55,903.15,0.0 -256,1024,2048,512,ck,17,0,6.2043,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,346.13,929.54,0.0 -256,1024,2048,1024,ck,15,0,9.4687,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,453.6,775.19,0.0005 -256,1024,2048,2048,ck,17,0,13.1692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,652.27,796.23,0.0 -256,1024,2048,4096,ck,17,0,21.7907,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,788.4,769.93,0.0 -256,2048,2048,512,ck,17,0,10.3309,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,415.74,1014.99,0.0 -256,2048,2048,1024,ck,14,0,13.6276,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,630.33,923.34,0.0 -256,2048,2048,2048,ck,14,0,18.8518,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,911.31,889.95,0.0002 -256,2048,2048,4096,ck,14,0,30.6689,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1120.34,820.56,0.0002 -256,4096,2048,512,ck,16,0,15.7595,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,545.06,1264.19,0.0 -256,4096,2048,1024,ck,13,0,20.6804,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,830.73,1115.48,0.0 -256,4096,2048,2048,ck,13,0,29.3797,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1169.51,999.33,0.0 -256,4096,2048,4096,ck,13,0,49.1486,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1398.2,853.39,0.0 -256,8192,2048,512,ck,16,0,25.8314,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,665.08,1501.94,0.0 -256,8192,2048,1024,ck,13,0,37.8569,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,907.62,1163.33,0.0 -256,8192,2048,2048,ck,13,0,54.7439,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1255.29,996.02,0.0 -256,8192,2048,4096,ck,13,0,91.6649,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1499.36,823.62,0.0 -256,16384,2048,512,ck,16,0,45.9039,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,748.51,1667.53,0.0 -256,16384,2048,1024,ck,13,0,67.9688,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1011.04,1265.04,0.0 -256,16384,2048,2048,ck,13,0,100.8975,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1362.16,1039.25,0.0 -256,16384,2048,4096,ck,13,0,172.1554,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1596.68,828.36,0.0 -256,32768,2048,512,ck,16,0,85.8103,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,800.83,1771.86,0.0 -256,32768,2048,1024,ck,13,0,131.0786,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1048.52,1295.93,0.0 -256,32768,2048,2048,ck,13,0,195.3896,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1406.82,1051.85,0.0 -256,32768,2048,4096,ck,13,0,336.668,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1632.93,822.25,0.0 -256,1,2176,4096,ck,7,0,12.8835,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.38,692.46,0.0 -256,2,2176,4096,ck,7,0,12.2827,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.9,727.02,0.0 -256,4,2176,4096,ck,7,0,12.2395,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.83,730.97,0.0 -256,8,2176,4096,ck,7,0,12.4327,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.47,722.33,0.0 -256,16,2176,4096,ck,7,0,11.3777,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.07,795.25,0.0 -256,32,2176,4096,ck,7,0,11.4366,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.88,802.97,0.0 -256,64,2176,4096,ck,7,0,11.3644,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,100.39,831.86,0.0 -256,128,2176,4096,ck,7,0,11.9866,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,190.35,833.78,0.0 -256,256,2176,4096,ck,17,0,13.8813,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,328.74,797.88,0.0 -256,512,2176,4096,ck,15,0,19.7041,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,463.19,671.85,0.0002 -256,1024,2176,4096,ck,14,0,26.8779,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,679.13,653.46,0.0 -256,2048,2176,4096,ck,14,0,39.7142,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,919.25,660.08,0.0002 -256,4096,2176,4096,ck,14,0,63.494,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1149.94,685.35,0.0003 -256,8192,2176,4096,ck,14,0,111.4756,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1309.96,700.77,0.0003 -256,16384,2176,4096,ck,0,0,200.586,a8w8_blockscale_bpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,1456.02,734.47,0.0 -256,32768,2176,4096,ck,0,0,361.4039,a8w8_blockscale_bpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,1616.24,790.63,0.0 -256,1,2304,2048,ck,7,0,7.0303,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.34,672.13,0.0 -256,2,2304,2048,ck,7,0,7.4911,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.52,631.67,0.0 -256,4,2304,2048,ck,7,0,7.7175,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.89,614.86,0.0 -256,8,2304,2048,ck,7,0,7.2733,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.38,656.08,0.0 -256,16,2304,2048,ck,7,0,6.7265,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.45,717.33,0.0 -256,32,2304,2048,ck,7,0,6.6844,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.18,737.78,0.0 -256,64,2304,2048,ck,7,0,6.8275,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,88.46,753.51,0.0 -256,128,2304,2048,ck,7,0,7.3232,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,164.95,760.67,0.0 -256,256,2304,2048,ck,17,0,9.3755,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,257.68,685.03,0.0 -256,512,2304,2048,ck,12,0,12.0763,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,400.11,672.93,0.0 -256,1024,2304,2048,ck,2,0,16.7332,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,577.52,689.31,0.0 -256,2048,2304,2048,ck,14,0,23.6368,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,817.68,776.34,0.0001 -256,4096,2304,2048,cktile,26,0,38.0636,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1015.53,840.21,0.0 -256,8192,2304,2048,ck,13,0,64.5936,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1196.86,917.19,0.0 -256,16384,2304,2048,ck,13,0,113.7928,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1358.78,999.8,0.0 -256,32768,2304,2048,ck,13,0,210.3495,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1470.11,1059.3,0.0 -256,1,2560,4096,ck,7,0,12.3531,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.7,849.58,0.0 -256,2,2560,4096,ck,7,0,12.5047,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.35,840.02,0.0 -256,4,2560,4096,ck,7,0,12.124,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.92,867.92,0.0 -256,8,2560,4096,ck,7,0,12.1307,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.83,870.48,0.0 -256,16,2560,4096,ck,7,0,12.0651,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.81,881.32,0.0 -256,32,2560,4096,ck,7,0,11.1263,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,60.32,968.94,0.0 -256,64,2560,4096,ck,7,0,11.6227,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.48,952.93,0.0 -256,128,2560,4096,ck,7,0,12.3257,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,217.79,946.43,0.0 -256,256,2560,4096,ck,17,0,13.8476,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,387.7,927.6,0.0 -256,512,2560,4096,ck,17,0,20.1587,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,532.64,754.23,0.0 -256,1024,2560,4096,ck,14,0,27.5173,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,780.41,724.02,0.0001 -256,2048,2560,4096,ck,14,0,41.4725,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1035.62,707.94,0.0002 -256,4096,2560,4096,cktile,28,0,59.7488,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1437.67,807.29,0.0 -256,8192,2560,4096,ck,13,0,114.6292,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1498.73,750.1,0.0 -256,16384,2560,4096,ck,13,0,207.1913,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1658.36,779.38,0.0 -256,32768,2560,4096,ck,13,0,392.3196,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1751.62,796.48,0.0 -256,1,3072,2048,ck,7,0,7.2249,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.74,871.94,0.0 -256,2,3072,2048,ck,7,0,7.4827,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.36,842.99,0.0 -256,4,3072,2048,ck,7,0,7.7519,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.49,815.83,0.0 -256,8,3072,2048,ck,7,0,7.6378,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.18,832.31,0.0 -256,16,3072,2048,ck,7,0,6.8232,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.51,941.28,0.0 -256,32,3072,2048,ck,7,0,7.4069,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.36,884.8,0.0 -256,64,3072,2048,ck,7,0,7.9281,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,101.58,859.69,0.0 -256,128,3072,2048,ck,7,0,7.9906,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,201.56,918.58,0.0 -256,256,3072,2048,ck,11,0,9.3418,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,344.82,897.96,0.0 -256,512,3072,2048,ck,12,0,12.7345,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,505.91,823.41,0.0 -256,1024,3072,2048,ck,14,0,17.6286,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,730.91,832.74,0.0001 -256,2048,3072,2048,ck,14,0,25.6421,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1004.98,899.64,0.0003 -256,4096,3072,2048,ck,13,0,41.8638,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1231.13,951.8,0.0 -256,8192,3072,2048,ck,13,0,76.4197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1348.86,960.49,0.0 -256,16384,3072,2048,ck,13,0,143.5179,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1436.46,979.04,0.0 -256,32768,3072,2048,ck,13,0,264.8182,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1556.98,1037.42,0.0 -256,1,4096,2048,ck,7,0,7.6088,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.2,1103.83,0.0 -256,2,4096,2048,ck,7,0,6.964,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.82,1207.51,0.0 -256,4,4096,1024,ck,7,0,5.181,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.48,816.67,0.0 -256,4,4096,2048,ck,7,0,7.3066,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.18,1153.69,0.0 -256,8,4096,2048,ck,7,0,7.1699,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.72,1181.4,0.0 -256,16,4096,1024,ck,7,0,5.1635,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.99,840.86,0.0 -256,16,4096,2048,ck,7,0,7.2391,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,37.08,1181.42,0.0 -256,32,4096,2048,ck,7,0,7.7878,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.94,1119.22,0.0 -256,64,4096,2048,ck,7,0,7.1537,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,150.1,1264.24,0.0 -256,128,4096,1024,ck,7,0,5.6283,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,190.78,954.81,0.0 -256,128,4096,2048,ck,7,0,7.6378,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,281.17,1269.91,0.0 -256,256,4096,1024,ck,12,0,6.7543,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,317.94,970.29,0.0 -256,256,4096,2048,ck,11,0,9.726,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,441.6,1132.02,0.0 -256,512,4096,2048,ck,17,0,13.313,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,645.23,1023.92,0.0 -256,1024,4096,1024,ck,14,0,12.7825,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,672.01,1066.42,0.0 -256,1024,4096,2048,ck,14,0,18.6898,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,919.21,1009.88,0.0002 -256,2048,4096,1024,ck,13,0,21.1184,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,813.5,1092.35,0.0 -256,2048,4096,2048,ck,13,0,29.6053,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1160.59,991.72,0.0 -256,4096,4096,1024,ck,13,0,38.902,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,883.24,1078.17,0.0 -256,4096,4096,2048,ck,13,0,54.8859,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1252.04,917.02,0.0 -256,8192,4096,1024,ck,14,0,69.6632,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,986.45,1143.96,0.0003 -256,8192,4096,2048,ck,13,0,100.0583,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1373.59,922.21,0.0 -256,16384,4096,1024,ck,13,0,131.4292,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1045.73,1180.78,0.0 -256,16384,4096,2048,ck,13,0,187.6366,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1464.95,938.84,0.0 -256,32768,4096,1024,ck,14,0,238.3201,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1153.4,1284.76,0.0004 -256,32768,4096,2048,ck,13,1,345.9207,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1589.25,994.25,0.0 -256,1,4352,4096,ck,7,0,12.4593,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.86,1431.75,0.0 -256,2,4352,4096,ck,7,0,12.4143,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.74,1437.97,0.0 -256,4,4352,4096,ck,7,0,12.3497,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.55,1447.56,0.0 -256,8,4352,4096,ck,7,0,12.4303,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.94,1442.3,0.0 -256,16,4352,4096,ck,7,0,11.3916,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.07,1582.8,0.0 -256,32,4352,4096,ck,7,0,11.4221,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.88,1596.5,0.0 -256,64,4352,4096,ck,7,0,11.9047,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.66,1566.19,0.0 -256,128,4352,4096,ck,17,0,14.242,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,320.42,1366.68,0.0 -256,256,4352,4096,ck,17,0,20.0387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,455.46,1053.09,0.0 -256,512,4352,4096,ck,14,0,26.8411,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,680.06,908.29,0.0 -256,1024,4352,4096,ck,14,0,40.6091,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,898.99,761.73,0.0001 -256,2048,4352,4096,cktile,26,0,58.3217,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1251.93,755.13,0.0 -256,4096,4352,4096,ck,13,0,106.0856,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1376.52,662.24,0.0 -256,8192,4352,4096,cktile,28,0,175.3808,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1665.28,699.53,0.0 -256,16384,4352,4096,ck,13,0,337.0876,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1732.83,675.02,0.0 -256,32768,4352,4096,cktile,28,0,623.3898,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1874.0,701.42,0.0 -256,1,4608,2048,ck,7,0,7.2729,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.6,1299.13,0.0 -256,1,4608,4096,ck,7,0,12.6298,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.99,1495.49,0.0 -256,2,4608,2048,ck,7,0,7.1868,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.25,1316.26,0.0 -256,2,4608,4096,ck,7,0,12.6641,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.96,1492.49,0.0 -256,4,4608,2048,ck,7,0,7.3386,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.29,1292.1,0.0 -256,4,4608,4096,ck,7,0,12.6152,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.97,1500.38,0.0 -256,8,4608,2048,ck,7,0,8.0515,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.75,1183.29,0.0 -256,8,4608,4096,ck,7,0,12.5023,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.15,1518.19,0.0 -256,16,4608,2048,ck,7,0,7.28,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.48,1321.07,0.0 -256,32,4608,2048,ck,7,0,7.7986,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.45,1256.33,0.0 -256,32,4608,4096,ck,7,0,11.6068,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,104.07,1662.85,0.0 -256,64,4608,2048,ck,6,0,8.0492,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,150.07,1262.0,0.0 -256,64,4608,4096,ck,12,0,12.3488,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,195.64,1597.43,0.0 -256,128,4608,2048,ck,12,0,9.9319,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,243.25,1095.36,0.0 -256,128,4608,4096,ck,17,0,14.28,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,338.36,1441.06,0.0 -256,256,4608,2048,ck,10,0,11.8567,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,407.52,1039.14,0.0214 -256,256,4608,4096,ck,10,0,20.4557,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,472.42,1089.29,0.0198 -256,512,4608,2048,ck,2,0,16.4601,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,587.1,923.71,0.0 -256,512,4608,4096,ck,2,0,27.5561,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,701.38,932.28,0.0 -256,1024,4608,2048,ck,14,0,23.4479,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,824.27,894.39,0.0002 -256,1024,4608,4096,ck,14,0,40.7002,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.74,798.67,0.0001 -256,2048,4608,2048,cktile,28,0,37.2525,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1037.64,872.58,0.0 -256,2048,4608,4096,cktile,26,0,57.5625,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1343.05,801.52,0.0 -256,4096,4608,2048,ck,13,0,63.8104,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1211.55,870.93,0.0 -256,4096,4608,4096,ck,13,0,107.8779,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1433.28,680.4,0.0 -256,8192,4608,2048,ck,13,0,112.5915,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1373.27,903.37,0.0 -256,8192,4608,4096,cktile,11,0,186.7698,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1655.72,684.94,0.0 -256,16384,4608,2048,ck,13,0,201.776,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1532.58,961.4,0.0 -256,16384,4608,4096,cktile,28,0,334.3939,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1849.54,708.68,0.0 -256,32768,4608,2048,ck,13,0,393.975,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1569.83,960.81,0.0 -256,32768,4608,4096,cktile,26,0,663.6484,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1863.86,685.73,0.0 -256,1,5120,4096,ck,7,0,12.8459,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.27,1633.66,0.0 -256,2,5120,4096,ck,7,0,12.7807,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.56,1643.12,0.0 -256,4,5120,4096,ck,7,0,12.7897,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.12,1644.2,0.0 -256,8,5120,4096,ck,7,0,12.7339,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,26.35,1655.91,0.0 -256,16,5120,4096,ck,7,0,11.8292,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.73,1792.25,0.0 -256,32,5120,4096,ck,7,0,11.7346,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.38,1826.25,0.0 -256,64,5120,4096,ck,12,0,12.6157,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,212.78,1735.06,0.0 -256,128,5120,4096,ck,17,0,14.6196,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,367.23,1560.0,0.0 -256,256,5120,4096,ck,17,0,20.7969,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,516.3,1184.87,0.0 -256,512,5120,4096,ck,14,0,27.981,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,767.48,1011.81,0.0 -256,1024,5120,4096,ck,1,0,42.2835,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,1015.75,843.16,0.0 -256,2048,5120,4096,cktile,11,0,57.9764,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1481.63,868.14,0.0 -256,4096,5120,4096,cktile,28,0,113.4632,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1514.14,702.36,0.0 -256,8192,5120,4096,ck,13,0,204.8152,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1677.6,675.79,0.0 -256,16384,5120,4096,cktile,27,0,385.5837,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1782.22,663.55,0.0 -256,32768,5120,4096,ck,13,0,739.9087,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1857.51,663.24,0.0 -256,1,6144,2048,ck,7,0,7.9306,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.17,1588.44,0.0 -256,2,6144,2048,ck,7,0,7.8348,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.42,1609.69,0.0 -256,4,6144,2048,ck,7,0,7.9327,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.69,1593.44,0.0 -256,8,6144,2048,ck,7,0,7.9833,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.22,1590.52,0.0 -256,16,6144,2048,ck,7,0,7.3427,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.84,1744.9,0.0 -256,32,6144,2048,ck,7,0,7.54,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,106.8,1729.66,0.0 -256,64,6144,2048,ck,12,0,8.3389,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,193.14,1618.97,0.0 -256,128,6144,2048,ck,11,0,9.9764,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,322.88,1445.2,0.0 -256,256,6144,2048,ck,12,0,12.3901,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,519.97,1311.77,0.0 -256,512,6144,2048,ck,14,0,17.6425,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,730.33,1129.26,0.0001 -256,1024,6144,2048,ck,14,0,24.9928,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1031.09,1090.83,0.0002 -256,2048,6144,2048,ck,13,0,44.574,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1156.27,940.98,0.0 -256,4096,6144,2048,ck,13,0,77.4893,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1330.24,920.17,0.0 -256,8192,6144,2048,ck,13,0,141.6849,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1455.05,917.69,0.0 -256,16384,6144,2048,ck,13,0,269.0614,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1532.43,919.73,0.0 -256,32768,6144,2048,ck,13,0,513.6492,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1605.44,939.06,0.0 -256,1,8704,4096,ck,7,0,14.0155,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.09,2545.26,0.0 -256,2,8704,4096,ck,7,0,13.6679,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.43,2611.56,0.0 -256,4,8704,4096,ck,7,0,13.4692,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.18,2653.28,0.0 -256,8,8704,4096,ck,7,0,13.631,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.85,2628.1,0.0 -256,16,8704,4096,ck,7,0,12.8177,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,89.01,2808.28,0.0 -256,32,8704,4096,ck,12,0,13.5796,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,168.02,2676.05,0.0 -256,64,8704,4096,ck,17,0,15.4234,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,295.88,2400.76,0.0 -256,128,8704,4096,ck,10,0,21.5622,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,423.28,1781.08,0.0226 -256,256,8704,4096,ck,14,0,28.0912,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,649.8,1465.11,0.0 -256,512,8704,4096,ck,14,0,40.8265,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,894.2,1142.93,0.0002 -256,1024,8704,4096,cktile,26,0,58.5138,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1247.82,985.61,0.0 -256,2048,8704,4096,ck,13,0,109.9061,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1328.67,725.09,0.0 -256,4096,8704,4096,cktile,26,0,174.8011,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1670.8,707.84,0.0 -256,8192,8704,4096,ck,13,0,331.6284,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1761.36,638.7,0.0 -256,16384,8704,4096,cktile,26,0,625.4187,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1867.92,620.34,0.0 -256,32768,8704,4096,cktile,26,0,1226.8874,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1904.38,603.39,0.0 -256,1,9216,2048,ck,7,0,7.7504,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.87,2437.92,0.0 -256,2,9216,2048,ck,7,0,7.4979,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.07,2522.75,0.0 -256,4,9216,2048,ck,7,0,7.9418,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.01,2386.9,0.0 -256,8,9216,2048,ck,7,0,7.4624,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.47,2551.22,0.0 -256,16,9216,2048,ck,7,0,7.5502,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,80.0,2543.25,0.0 -256,32,9216,2048,ck,7,0,8.2702,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,146.06,2361.46,0.0 -256,64,9216,2048,ck,17,0,9.7695,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,247.29,2066.13,0.0 -256,128,9216,2048,ck,12,0,12.5155,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,386.07,1717.53,0.0 -256,256,9216,2048,ck,2,0,16.9832,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,569.01,1420.07,0.0 -256,512,9216,2048,ck,14,0,23.8481,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,810.44,1231.13,0.0002 -256,1024,9216,2048,cktile,26,0,37.0071,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1044.52,1076.71,0.0 -256,2048,9216,2048,ck,13,0,66.817,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1157.03,910.21,0.0 -256,4096,9216,2048,ck,13,0,112.9284,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1369.18,909.96,0.0 -256,8192,9216,2048,ck,13,0,206.2058,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1499.66,905.15,0.0 -256,16384,9216,2048,ck,13,0,387.8534,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1594.61,913.8,0.0 -256,32768,9216,2048,ck,13,0,741.7658,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1667.58,930.16,0.0 -256,1,10240,4096,ck,7,0,14.125,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.94,2971.16,0.0 -256,2,10240,4096,ck,7,0,14.5233,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.55,2891.37,0.0 -256,4,10240,4096,ck,7,0,14.1213,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.76,2977.16,0.0 -256,8,10240,4096,ck,7,0,14.1235,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.52,2983.65,0.0 -256,16,10240,4096,ck,7,0,13.1178,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,102.32,3227.39,0.0 -256,32,10240,4096,ck,12,0,14.0676,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,190.82,3037.44,0.0 -256,64,10240,4096,ck,17,0,16.0505,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,334.49,2711.19,0.0 -256,128,10240,4096,ck,17,0,22.1575,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,484.6,2034.92,0.0 -256,256,10240,4096,ck,14,0,29.4165,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,730.03,1639.71,0.0 -256,512,10240,4096,ck,14,0,43.0694,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,997.22,1266.0,0.0002 -256,1024,10240,4096,cktile,11,0,58.1448,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1477.33,1154.17,0.0 -256,2048,10240,4096,cktile,27,0,112.1256,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1532.2,822.96,0.0 -256,4096,10240,4096,ck,13,0,205.6209,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1671.02,693.54,0.0 -256,8192,10240,4096,cktile,26,0,376.1269,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1827.03,646.78,0.0 -256,16384,10240,4096,cktile,27,0,704.2104,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1951.67,631.34,0.0 -256,32768,10240,4096,cktile,28,0,1404.9529,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1956.49,603.04,0.0 -256,1,12288,2048,ck,7,0,8.545,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.89,2948.21,0.0 -256,2,12288,2048,ck,7,0,8.9331,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.27,2823.1,0.0 -256,4,12288,2048,ck,7,0,8.6807,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.19,2911.32,0.0 -256,8,12288,2048,ck,7,0,8.4403,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.71,3006.86,0.0 -256,16,12288,2048,ck,7,0,7.8145,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.05,3274.91,0.0 -256,32,12288,2048,ck,7,0,9.012,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,178.72,2887.02,0.0 -256,64,12288,2048,ck,11,0,10.7084,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,300.81,2509.22,0.0 -256,128,12288,2048,ck,12,0,13.0547,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,493.5,2188.77,0.0 -256,256,12288,2048,ck,15,0,17.9751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,716.82,1779.22,0.0022 -256,512,12288,2048,ck,14,0,25.1576,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1024.33,1542.17,0.0002 -256,1024,12288,2048,ck,13,0,45.3706,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1135.97,1155.57,0.0 -256,2048,12288,2048,ck,13,0,79.2269,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1301.06,1005.87,0.0 -256,4096,12288,2048,ck,13,0,143.8759,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1432.89,932.87,0.0 -256,8192,12288,2048,ck,13,0,266.9287,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1544.67,911.37,0.0 -256,16384,12288,2048,ck,13,0,504.5565,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1634.37,914.41,0.0 -256,32768,12288,2048,ck,13,0,990.9729,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1664.29,905.76,0.0 -256,1,17408,4096,ck,7,0,17.2253,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.28,4141.7,0.0 -256,2,17408,4096,ck,7,0,16.4033,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.39,4351.62,0.0 -256,4,17408,4096,ck,7,0,16.5658,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,34.43,4313.64,0.0 -256,8,17408,4096,ck,7,0,16.643,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.55,4302.98,0.0 -256,16,17408,4096,ck,7,0,16.8605,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,135.33,4265.93,0.0 -256,32,17408,4096,ck,12,0,19.4233,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.94,3735.12,0.0 -256,64,17408,4096,ck,10,0,23.5642,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,387.32,3131.6,0.0206 -256,128,17408,4096,ck,2,0,30.9071,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,590.6,2468.17,0.0 -256,256,17408,4096,ck,14,0,41.891,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,871.48,1939.91,0.0002 -256,512,17408,4096,cktile,26,0,57.9873,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1259.15,1573.21,0.0 -256,1024,17408,4096,ck,13,0,109.526,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1333.28,1014.82,0.0 -256,2048,17408,4096,cktile,26,0,171.9678,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1698.33,878.04,0.0 -256,4096,17408,4096,ck,13,0,332.8382,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1754.95,693.09,0.0 -256,8192,17408,4096,cktile,11,0,617.9339,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1890.54,631.25,0.0 -256,16384,17408,4096,ck,13,0,1208.8428,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1932.81,586.38,0.0 -256,32768,17408,4096,cktile,27,0,2389.7849,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1955.37,563.39,0.0 -256,1,20480,4096,ck,7,0,17.7274,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.46,4734.54,0.0 -256,2,20480,4096,ck,7,0,17.4288,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.25,4818.24,0.0 -256,4,20480,4096,ck,7,0,17.4998,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.35,4803.84,0.0 -256,8,20480,4096,ck,7,0,17.7101,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,75.79,4756.98,0.0 -256,16,20480,4096,ck,7,0,18.2244,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,147.29,4642.51,0.0 -256,32,20480,4096,ck,12,0,20.1716,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.15,4230.1,0.0 -256,64,20480,4096,ck,10,0,24.8796,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,431.58,3487.58,0.021 -256,128,20480,4096,ck,2,0,32.1737,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,667.47,2786.54,0.0 -256,256,20480,4096,ck,14,0,44.3436,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,968.57,2151.84,0.0002 -256,512,20480,4096,cktile,26,0,58.6077,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1465.67,1824.93,0.0 -256,1024,20480,4096,cktile,11,0,112.2264,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1530.82,1158.58,0.0 -256,2048,20480,4096,cktile,26,0,198.7021,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1729.21,886.56,0.0 -256,4096,20480,4096,cktile,28,0,371.7097,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1848.74,722.16,0.0 -256,8192,20480,4096,cktile,11,0,708.3984,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1940.14,639.45,0.0 -256,16384,20480,4096,cktile,27,0,1387.4271,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1981.21,592.52,0.0 -256,32768,20480,4096,cktile,28,0,2721.3003,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2020.2,573.36,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,16,4096,cktile,12,0,9.9757,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.01,6.98,0.0 +gfx950,256,1,32,4096,cktile,12,0,9.9923,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.03,13.53,0.0 +gfx950,256,1,256,512,ck,7,0,2.6477,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.1,49.89,0.0 +gfx950,256,1,512,256,ck,5,0,2.7069,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.1,48.89,0.0 +gfx950,256,1,512,512,ck,7,0,2.9061,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.18,90.73,0.0 +gfx950,256,1,512,1024,ck,6,0,4.124,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.25,127.63,0.0 +gfx950,256,1,512,2048,ck,6,0,5.2158,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.4,201.63,0.0 +gfx950,256,1,1024,256,ck,5,0,2.7529,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.19,96.06,0.0 +gfx950,256,1,1024,512,ck,7,0,2.9323,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.36,179.67,0.0 +gfx950,256,1,1024,1024,cktile,1,0,4.3045,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.49,244.31,0.0 +gfx950,256,1,1024,2048,ck,7,0,5.17,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.81,406.43,0.0 +gfx950,256,1,1024,4096,ck,7,0,11.8308,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.71,355.04,0.0 +gfx950,256,1,1152,2048,ck,6,0,5.7928,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.81,408.03,0.0 +gfx950,256,1,1536,2048,ck,6,0,7.6446,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.82,412.17,0.0 +gfx950,256,1,2048,512,cktile,15,0,3.5207,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_4,0.6,299.14,0.0 +gfx950,256,1,2048,1024,ck,7,0,3.8919,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.08,540.17,0.0 +gfx950,256,1,2048,2048,ck,7,0,7.1436,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.17,588.0,0.0 +gfx950,256,1,2048,4096,ck,7,0,12.1487,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.38,691.17,0.0 +gfx950,256,1,2176,4096,ck,7,0,12.8835,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.38,692.46,0.0 +gfx950,256,1,2304,2048,ck,7,0,7.0303,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.34,672.13,0.0 +gfx950,256,1,2560,4096,ck,7,0,12.3531,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.7,849.58,0.0 +gfx950,256,1,3072,2048,ck,7,0,7.2249,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.74,871.94,0.0 +gfx950,256,1,4096,2048,ck,7,0,7.6088,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.2,1103.83,0.0 +gfx950,256,1,4352,4096,ck,7,0,12.4593,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.86,1431.75,0.0 +gfx950,256,1,4608,2048,ck,7,0,7.2729,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.6,1299.13,0.0 +gfx950,256,1,4608,4096,ck,7,0,12.6298,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.99,1495.49,0.0 +gfx950,256,1,5120,4096,ck,7,0,12.8459,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.27,1633.66,0.0 +gfx950,256,1,6144,2048,ck,7,0,7.9306,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.17,1588.44,0.0 +gfx950,256,1,8704,4096,ck,7,0,14.0155,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.09,2545.26,0.0 +gfx950,256,1,9216,2048,ck,7,0,7.7504,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.87,2437.92,0.0 +gfx950,256,1,10240,4096,ck,7,0,14.125,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.94,2971.16,0.0 +gfx950,256,1,12288,2048,ck,7,0,8.545,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.89,2948.21,0.0 +gfx950,256,1,17408,4096,ck,7,0,17.2253,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.28,4141.7,0.0 +gfx950,256,1,20480,4096,ck,7,0,17.7274,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.46,4734.54,0.0 +gfx950,256,2,16,4096,cktile,12,0,13.9708,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.02,5.28,0.0 +gfx950,256,2,32,4096,cktile,1,0,13.9482,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.04,9.99,0.0 +gfx950,256,2,256,512,ck,7,0,2.6554,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.2,50.13,0.0 +gfx950,256,2,512,256,ck,6,0,2.5406,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.21,52.6,0.0 +gfx950,256,2,512,512,ck,6,0,2.7919,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.38,94.99,0.0 +gfx950,256,2,512,1024,ck,6,0,3.8596,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.54,136.9,0.0 +gfx950,256,2,512,2048,ck,6,0,5.4904,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.76,192.1,0.0 +gfx950,256,2,1024,256,ck,5,0,2.7842,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.38,95.81,0.0 +gfx950,256,2,1024,512,ck,7,0,2.9541,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.71,179.21,0.0 +gfx950,256,2,1024,1024,ck,7,0,3.8918,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.08,271.01,0.0 +gfx950,256,2,1024,2048,ck,7,0,5.2282,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.6,402.69,0.0 +gfx950,256,2,1024,4096,ck,7,0,12.0383,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.39,349.43,0.0 +gfx950,256,2,1152,2048,ck,7,0,5.8106,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.62,407.53,0.0 +gfx950,256,2,1536,2048,ck,7,0,7.5211,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.67,419.62,0.0 +gfx950,256,2,2048,512,ck,7,0,2.9658,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.41,356.66,0.0 +gfx950,256,2,2048,1024,ck,7,0,3.9292,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.13,536.34,0.0 +gfx950,256,2,2048,2048,ck,7,0,7.3283,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.29,574.02,0.0 +gfx950,256,2,2048,4096,ck,7,0,12.2679,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.74,685.12,0.0 +gfx950,256,2,2176,4096,ck,7,0,12.2827,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.9,727.02,0.0 +gfx950,256,2,2304,2048,ck,7,0,7.4911,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.52,631.67,0.0 +gfx950,256,2,2560,4096,ck,7,0,12.5047,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.35,840.02,0.0 +gfx950,256,2,3072,2048,ck,7,0,7.4827,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.36,842.99,0.0 +gfx950,256,2,4096,2048,ck,7,0,6.964,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.82,1207.51,0.0 +gfx950,256,2,4352,4096,ck,7,0,12.4143,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.74,1437.97,0.0 +gfx950,256,2,4608,2048,ck,7,0,7.1868,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.25,1316.26,0.0 +gfx950,256,2,4608,4096,ck,7,0,12.6641,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.96,1492.49,0.0 +gfx950,256,2,5120,4096,ck,7,0,12.7807,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.56,1643.12,0.0 +gfx950,256,2,6144,2048,ck,7,0,7.8348,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.42,1609.69,0.0 +gfx950,256,2,8704,4096,ck,7,0,13.6679,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.43,2611.56,0.0 +gfx950,256,2,9216,2048,ck,7,0,7.4979,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.07,2522.75,0.0 +gfx950,256,2,10240,4096,ck,7,0,14.5233,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.55,2891.37,0.0 +gfx950,256,2,12288,2048,ck,7,0,8.9331,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.27,2823.1,0.0 +gfx950,256,2,17408,4096,ck,7,0,16.4033,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.39,4351.62,0.0 +gfx950,256,2,20480,4096,ck,7,0,17.4288,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.25,4818.24,0.0 +gfx950,256,4,16,4096,cktile,0,0,14.2842,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,0.04,5.74,0.0 +gfx950,256,4,32,4096,cktile,2,0,14.2116,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.07,10.39,0.0 +gfx950,256,4,256,512,ck,7,0,2.4076,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.44,56.14,0.0 +gfx950,256,4,512,256,ck,4,0,2.6362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,0.4,51.66,0.0 +gfx950,256,4,512,512,ck,6,0,3.0436,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.69,88.15,0.0 +gfx950,256,4,512,1024,ck,7,0,3.8855,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.08,137.04,0.0 +gfx950,256,4,512,2048,ck,7,0,5.2943,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.58,200.38,0.0 +gfx950,256,4,1024,256,ck,5,0,2.7982,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.75,96.98,0.0 +gfx950,256,4,1024,512,ck,7,0,3.5522,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.18,150.48,0.0 +gfx950,256,4,1024,1024,ck,7,0,3.9714,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.11,267.13,0.0 +gfx950,256,4,1024,2048,ck,7,0,5.2853,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.17,399.89,0.0 +gfx950,256,4,1024,4096,ck,7,0,12.2115,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.75,345.48,0.0 +gfx950,256,4,1152,2048,ck,7,0,5.8616,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.22,405.47,0.0 +gfx950,256,4,1536,2048,ck,7,0,7.258,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.47,436.24,0.0 +gfx950,256,4,2048,512,ck,7,0,2.9866,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.81,357.27,0.0 +gfx950,256,4,2048,1024,ck,6,0,4.4419,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,3.78,476.74,0.0 +gfx950,256,4,2048,2048,ck,7,0,7.2284,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.64,583.65,0.0 +gfx950,256,4,2048,4096,ck,7,0,12.1475,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.52,693.26,0.0 +gfx950,256,4,2176,4096,ck,7,0,12.2395,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.83,730.97,0.0 +gfx950,256,4,2304,2048,ck,7,0,7.7175,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.89,614.86,0.0 +gfx950,256,4,2560,4096,ck,7,0,12.124,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.92,867.92,0.0 +gfx950,256,4,3072,2048,ck,7,0,7.7519,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.49,815.83,0.0 +gfx950,256,4,4096,1024,ck,7,0,5.181,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.48,816.67,0.0 +gfx950,256,4,4096,2048,ck,7,0,7.3066,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.18,1153.69,0.0 +gfx950,256,4,4352,4096,ck,7,0,12.3497,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.55,1447.56,0.0 +gfx950,256,4,4608,2048,ck,7,0,7.3386,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.29,1292.1,0.0 +gfx950,256,4,4608,4096,ck,7,0,12.6152,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.97,1500.38,0.0 +gfx950,256,4,5120,4096,ck,7,0,12.7897,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.12,1644.2,0.0 +gfx950,256,4,6144,2048,ck,7,0,7.9327,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.69,1593.44,0.0 +gfx950,256,4,8704,4096,ck,7,0,13.4692,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.18,2653.28,0.0 +gfx950,256,4,9216,2048,ck,7,0,7.9418,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.01,2386.9,0.0 +gfx950,256,4,10240,4096,ck,7,0,14.1213,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.76,2977.16,0.0 +gfx950,256,4,12288,2048,ck,7,0,8.6807,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.19,2911.32,0.0 +gfx950,256,4,17408,4096,ck,7,0,16.5658,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,34.43,4313.64,0.0 +gfx950,256,4,20480,4096,ck,7,0,17.4998,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.35,4803.84,0.0 +gfx950,256,8,16,4096,cktile,2,0,14.5794,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.07,6.76,0.0 +gfx950,256,8,32,4096,cktile,1,0,14.576,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.14,11.28,0.0 +gfx950,256,8,256,512,ck,7,0,2.6846,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.78,51.88,0.0 +gfx950,256,8,512,256,ck,6,0,2.5849,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0.81,54.67,0.0 +gfx950,256,8,512,512,ck,6,0,2.8378,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,1.48,96.71,0.0 +gfx950,256,8,512,1024,ck,6,0,4.1566,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,2.02,130.08,0.0 +gfx950,256,8,512,2048,ck,6,0,5.404,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,3.1,198.58,0.0 +gfx950,256,8,1024,256,ck,7,0,2.6714,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.57,105.03,0.0 +gfx950,256,8,1024,512,ck,6,0,3.3901,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,2.47,160.69,0.0 +gfx950,256,8,1024,1024,ck,6,0,4.5254,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,3.71,237.14,0.0 +gfx950,256,8,1024,2048,ck,7,0,5.3151,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.31,400.73,0.0 +gfx950,256,8,1024,4096,ck,7,0,11.9359,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.62,355.52,0.0 +gfx950,256,8,1152,2048,ck,7,0,5.9408,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.35,402.99,0.0 +gfx950,256,8,1536,2048,ck,7,0,7.2516,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.94,439.45,0.0 +gfx950,256,8,2048,512,ck,7,0,3.0406,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.52,356.98,0.0 +gfx950,256,8,2048,1024,ck,7,0,3.964,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.46,539.38,0.0 +gfx950,256,8,2048,2048,ck,7,0,7.2237,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.29,587.44,0.0 +gfx950,256,8,2048,4096,ck,7,0,12.0807,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.11,699.81,0.0 +gfx950,256,8,2176,4096,ck,7,0,12.4327,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.47,722.33,0.0 +gfx950,256,8,2304,2048,ck,7,0,7.2733,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.38,656.08,0.0 +gfx950,256,8,2560,4096,ck,7,0,12.1307,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.83,870.48,0.0 +gfx950,256,8,3072,2048,ck,7,0,7.6378,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.18,832.31,0.0 +gfx950,256,8,4096,2048,ck,7,0,7.1699,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.72,1181.4,0.0 +gfx950,256,8,4352,4096,ck,7,0,12.4303,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.94,1442.3,0.0 +gfx950,256,8,4608,2048,ck,7,0,8.0515,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.75,1183.29,0.0 +gfx950,256,8,4608,4096,ck,7,0,12.5023,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.15,1518.19,0.0 +gfx950,256,8,5120,4096,ck,7,0,12.7339,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,26.35,1655.91,0.0 +gfx950,256,8,6144,2048,ck,7,0,7.9833,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.22,1590.52,0.0 +gfx950,256,8,8704,4096,ck,7,0,13.631,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.85,2628.1,0.0 +gfx950,256,8,9216,2048,ck,7,0,7.4624,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.47,2551.22,0.0 +gfx950,256,8,10240,4096,ck,7,0,14.1235,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.52,2983.65,0.0 +gfx950,256,8,12288,2048,ck,7,0,8.4403,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.71,3006.86,0.0 +gfx950,256,8,17408,4096,ck,7,0,16.643,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.55,4302.98,0.0 +gfx950,256,8,20480,4096,ck,7,0,17.7101,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,75.79,4756.98,0.0 +gfx950,256,16,16,4096,cktile,12,0,15.0983,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.14,8.72,0.0 +gfx950,256,16,32,4096,cktile,1,0,15.1038,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,0.28,13.08,0.0 +gfx950,256,16,256,512,ck,7,0,2.7194,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.54,54.22,0.0 +gfx950,256,16,512,256,ck,6,0,2.6211,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,1.6,57.82,0.0 +gfx950,256,16,512,512,ck,6,0,2.8905,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,2.9,99.19,0.0 +gfx950,256,16,512,1024,ck,7,0,3.9659,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.23,140.46,0.0 +gfx950,256,16,512,2048,ck,7,0,5.2571,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.38,208.81,0.0 +gfx950,256,16,1024,256,ck,7,0,2.7287,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.07,109.58,0.0 +gfx950,256,16,1024,512,ck,7,0,3.0314,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.53,186.46,0.0 +gfx950,256,16,1024,1024,ck,7,0,4.569,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.34,240.26,0.0 +gfx950,256,16,1024,2048,ck,7,0,5.1776,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.96,417.7,0.0 +gfx950,256,16,1024,4096,ck,7,0,11.4238,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.75,375.76,0.0 +gfx950,256,16,1152,2048,ck,7,0,5.5125,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.7,440.62,0.0 +gfx950,256,16,1536,2048,ck,7,0,7.5651,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.31,426.65,0.0 +gfx950,256,16,2048,512,ck,7,0,3.0452,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.02,368.55,0.0 +gfx950,256,16,2048,1024,ck,7,0,3.9842,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.84,546.93,0.0 +gfx950,256,16,2048,2048,ck,7,0,6.856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.58,626.11,0.0 +gfx950,256,16,2048,4096,ck,7,0,11.2919,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.77,754.49,0.0 +gfx950,256,16,2176,4096,ck,7,0,11.3777,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.07,795.25,0.0 +gfx950,256,16,2304,2048,ck,7,0,6.7265,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.45,717.33,0.0 +gfx950,256,16,2560,4096,ck,7,0,12.0651,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.81,881.32,0.0 +gfx950,256,16,3072,2048,ck,7,0,6.8232,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.51,941.28,0.0 +gfx950,256,16,4096,1024,ck,7,0,5.1635,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.99,840.86,0.0 +gfx950,256,16,4096,2048,ck,7,0,7.2391,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,37.08,1181.42,0.0 +gfx950,256,16,4352,4096,ck,7,0,11.3916,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.07,1582.8,0.0 +gfx950,256,16,4608,2048,ck,7,0,7.28,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.48,1321.07,0.0 +gfx950,256,16,5120,4096,ck,7,0,11.8292,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.73,1792.25,0.0 +gfx950,256,16,6144,2048,ck,7,0,7.3427,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.84,1744.9,0.0 +gfx950,256,16,8704,4096,ck,7,0,12.8177,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,89.01,2808.28,0.0 +gfx950,256,16,9216,2048,ck,7,0,7.5502,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,80.0,2543.25,0.0 +gfx950,256,16,10240,4096,ck,7,0,13.1178,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,102.32,3227.39,0.0 +gfx950,256,16,12288,2048,ck,7,0,7.8145,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.05,3274.91,0.0 +gfx950,256,16,17408,4096,ck,7,0,16.8605,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,135.33,4265.93,0.0 +gfx950,256,16,20480,4096,ck,7,0,18.2244,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,147.29,4642.51,0.0 +gfx950,256,32,16,4096,cktile,2,0,15.196,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.28,13.01,0.0 +gfx950,256,32,32,4096,cktile,12,0,15.3304,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,0.55,17.23,0.0 +gfx950,256,32,256,512,ck,12,0,2.9842,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,2.81,54.9,0.0 +gfx950,256,32,512,256,ck,7,0,2.9498,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.84,58.32,0.0 +gfx950,256,32,512,512,ck,7,0,3.2402,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.18,96.07,0.0 +gfx950,256,32,512,1024,ck,7,0,4.1647,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.06,141.62,0.0 +gfx950,256,32,512,2048,ck,7,0,5.4239,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.37,211.45,0.0 +gfx950,256,32,1024,256,ck,7,0,3.0511,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.5,110.08,0.0 +gfx950,256,32,1024,512,ck,7,0,3.2677,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.27,185.52,0.0 +gfx950,256,32,1024,1024,ck,7,0,4.8239,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.91,237.75,0.0 +gfx950,256,32,1024,2048,ck,7,0,5.3868,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.92,413.65,0.0 +gfx950,256,32,1024,4096,ck,7,0,11.2193,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.93,391.37,0.0 +gfx950,256,32,1152,2048,ck,7,0,5.4689,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.61,456.87,0.0 +gfx950,256,32,1536,2048,ck,12,0,7.6017,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,26.48,435.37,0.0 +gfx950,256,32,2048,512,ck,7,0,3.2685,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.53,365.93,0.0 +gfx950,256,32,2048,1024,ck,7,0,4.2361,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,31.68,533.74,0.0 +gfx950,256,32,2048,2048,ck,7,0,7.1317,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,37.64,615.69,0.0 +gfx950,256,32,2048,4096,ck,7,0,11.4391,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.93,756.24,0.0 +gfx950,256,32,2176,4096,ck,7,0,11.4366,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.88,802.97,0.0 +gfx950,256,32,2304,2048,ck,7,0,6.6844,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.18,737.78,0.0 +gfx950,256,32,2560,4096,ck,7,0,11.1263,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,60.32,968.94,0.0 +gfx950,256,32,3072,2048,ck,7,0,7.4069,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.36,884.8,0.0 +gfx950,256,32,4096,2048,ck,7,0,7.7878,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.94,1119.22,0.0 +gfx950,256,32,4352,4096,ck,7,0,11.4221,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.88,1596.5,0.0 +gfx950,256,32,4608,2048,ck,7,0,7.7986,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.45,1256.33,0.0 +gfx950,256,32,4608,4096,ck,7,0,11.6068,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,104.07,1662.85,0.0 +gfx950,256,32,5120,4096,ck,7,0,11.7346,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.38,1826.25,0.0 +gfx950,256,32,6144,2048,ck,7,0,7.54,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,106.8,1729.66,0.0 +gfx950,256,32,8704,4096,ck,12,0,13.5796,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,168.02,2676.05,0.0 +gfx950,256,32,9216,2048,ck,7,0,8.2702,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,146.06,2361.46,0.0 +gfx950,256,32,10240,4096,ck,12,0,14.0676,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,190.82,3037.44,0.0 +gfx950,256,32,12288,2048,ck,7,0,9.012,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,178.72,2887.02,0.0 +gfx950,256,32,17408,4096,ck,12,0,19.4233,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.94,3735.12,0.0 +gfx950,256,32,20480,4096,ck,12,0,20.1716,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.15,4230.1,0.0 +gfx950,256,64,16,4096,cktile,2,0,15.3293,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,0.55,21.51,0.0 +gfx950,256,64,32,4096,cktile,1,0,15.251,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,1.1,26.05,0.0 +gfx950,256,64,256,512,ck,6,0,3.6491,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,4.6,53.88,0.0 +gfx950,256,64,512,256,ck,7,0,2.9421,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.7,72.39,0.0 +gfx950,256,64,512,512,ck,7,0,3.2447,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.34,111.09,0.0 +gfx950,256,64,512,1024,ck,12,0,4.617,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,14.54,141.94,0.0 +gfx950,256,64,512,2048,ck,12,0,6.0064,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,22.35,207.31,0.0 +gfx950,256,64,1024,256,ck,5,0,3.0742,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,10.91,133.24,0.0 +gfx950,256,64,1024,512,ck,6,0,3.6706,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,18.28,187.47,0.0 +gfx950,256,64,1024,1024,ck,7,0,4.2391,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,31.66,293.74,0.0 +gfx950,256,64,1024,2048,ck,7,0,5.3999,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.71,436.91,0.0 +gfx950,256,64,1024,4096,ck,7,0,11.0075,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.77,416.76,0.0 +gfx950,256,64,1152,2048,ck,7,0,6.1787,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.88,426.92,0.0 +gfx950,256,64,1536,2048,ck,7,0,7.3449,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.82,472.9,0.0 +gfx950,256,64,2048,512,ck,7,0,3.3002,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.67,407.09,0.0 +gfx950,256,64,2048,1024,ck,7,0,4.3176,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.17,561.62,0.0 +gfx950,256,64,2048,2048,ck,7,0,7.1316,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,75.28,643.27,0.0 +gfx950,256,64,2048,4096,ck,7,0,11.2775,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,95.21,790.33,0.0 +gfx950,256,64,2176,4096,ck,7,0,11.3644,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,100.39,831.86,0.0 +gfx950,256,64,2304,2048,ck,7,0,6.8275,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,88.46,753.51,0.0 +gfx950,256,64,2560,4096,ck,7,0,11.6227,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.48,952.93,0.0 +gfx950,256,64,3072,2048,ck,7,0,7.9281,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,101.58,859.69,0.0 +gfx950,256,64,4096,2048,ck,7,0,7.1537,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,150.1,1264.24,0.0 +gfx950,256,64,4352,4096,ck,7,0,11.9047,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.66,1566.19,0.0 +gfx950,256,64,4608,2048,ck,6,0,8.0492,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,150.07,1262.0,0.0 +gfx950,256,64,4608,4096,ck,12,0,12.3488,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,195.64,1597.43,0.0 +gfx950,256,64,5120,4096,ck,12,0,12.6157,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,212.78,1735.06,0.0 +gfx950,256,64,6144,2048,ck,12,0,8.3389,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,193.14,1618.97,0.0 +gfx950,256,64,8704,4096,ck,17,0,15.4234,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,295.88,2400.76,0.0 +gfx950,256,64,9216,2048,ck,17,0,9.7695,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,247.29,2066.13,0.0 +gfx950,256,64,10240,4096,ck,17,0,16.0505,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,334.49,2711.19,0.0 +gfx950,256,64,12288,2048,ck,11,0,10.7084,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,300.81,2509.22,0.0 +gfx950,256,64,17408,4096,ck,10,0,23.5642,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,387.32,3131.6,0.0206 +gfx950,256,64,20480,4096,ck,10,0,24.8796,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,431.58,3487.58,0.021 +gfx950,256,128,16,4096,cktile,2,0,15.5018,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,1.08,38.31,0.0 +gfx950,256,128,32,4096,cktile,12,0,15.6217,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,2.15,42.48,0.0 +gfx950,256,128,256,512,ck,7,0,3.2278,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.4,81.21,0.0 +gfx950,256,128,512,256,ck,10,0,3.388,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,9.9,87.05,0.0 +gfx950,256,128,512,512,ck,7,0,3.2281,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.79,142.11,0.0 +gfx950,256,128,512,1024,ck,6,0,4.7301,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,28.38,166.26,0.0 +gfx950,256,128,512,2048,ck,7,0,5.4576,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.19,264.18,0.0 +gfx950,256,128,1024,256,ck,7,0,2.9858,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.48,186.57,0.0 +gfx950,256,128,1024,512,ck,7,0,3.3077,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.58,257.57,0.0 +gfx950,256,128,1024,1024,ck,7,0,4.2871,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.61,336.31,0.0 +gfx950,256,128,1024,2048,ck,7,0,5.8742,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,91.39,446.26,0.0 +gfx950,256,128,1024,4096,asm,5,6,7.7633,_ZN5aiter42fp8gemm_bf16_blockscale_BpreShuffle_32x128E,138.31,641.57,0.0001 +gfx950,256,128,1152,2048,ck,7,0,6.7713,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,89.2,430.69,0.0 +gfx950,256,128,1536,2048,ck,7,0,7.0358,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.46,540.25,0.0 +gfx950,256,128,2048,512,ck,7,0,3.3974,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,79.01,482.25,0.0 +gfx950,256,128,2048,1024,ck,7,0,5.065,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,106.0,543.44,0.0 +gfx950,256,128,2048,2048,ck,7,0,7.0523,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,152.25,706.26,0.0 +gfx950,256,128,2048,4096,ck,7,0,11.2291,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.24,840.42,0.0 +gfx950,256,128,2176,4096,ck,7,0,11.9866,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,190.35,833.78,0.0 +gfx950,256,128,2304,2048,ck,7,0,7.3232,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,164.95,760.67,0.0 +gfx950,256,128,2560,4096,ck,7,0,12.3257,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,217.79,946.43,0.0 +gfx950,256,128,3072,2048,ck,7,0,7.9906,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,201.56,918.58,0.0 +gfx950,256,128,4096,1024,ck,7,0,5.6283,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,190.78,954.81,0.0 +gfx950,256,128,4096,2048,ck,7,0,7.6378,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,281.17,1269.91,0.0 +gfx950,256,128,4352,4096,ck,17,0,14.242,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,320.42,1366.68,0.0 +gfx950,256,128,4608,2048,ck,12,0,9.9319,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,243.25,1095.36,0.0 +gfx950,256,128,4608,4096,ck,17,0,14.28,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,338.36,1441.06,0.0 +gfx950,256,128,5120,4096,ck,17,0,14.6196,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,367.23,1560.0,0.0 +gfx950,256,128,6144,2048,ck,11,0,9.9764,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,322.88,1445.2,0.0 +gfx950,256,128,8704,4096,ck,10,0,21.5622,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,423.28,1781.08,0.0226 +gfx950,256,128,9216,2048,ck,12,0,12.5155,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,386.07,1717.53,0.0 +gfx950,256,128,10240,4096,ck,17,0,22.1575,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,484.6,2034.92,0.0 +gfx950,256,128,12288,2048,ck,12,0,13.0547,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,493.5,2188.77,0.0 +gfx950,256,128,17408,4096,ck,2,0,30.9071,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,590.6,2468.17,0.0 +gfx950,256,128,20480,4096,ck,2,0,32.1737,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,667.47,2786.54,0.0 +gfx950,256,256,16,4096,cktile,0,0,15.5628,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,2.16,72.11,0.0 +gfx950,256,256,32,4096,cktile,0,0,15.6541,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,4.29,76.4,0.0 +gfx950,256,256,256,512,ck,7,0,3.2341,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.75,121.58,0.0 +gfx950,256,256,512,256,ck,7,0,2.9734,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.57,154.29,0.0 +gfx950,256,256,512,512,ck,7,0,3.2856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.85,199.46,0.0 +gfx950,256,256,512,1024,ck,7,0,4.2998,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.43,243.87,0.0 +gfx950,256,256,512,2048,ck,12,0,6.0343,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,88.97,304.1,0.0 +gfx950,256,256,1024,256,ck,7,0,3.0616,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.84,278.28,0.0 +gfx950,256,256,1024,512,ck,7,0,3.4184,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.53,345.09,0.0 +gfx950,256,256,1024,1024,ck,7,0,4.5507,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,117.98,403.24,0.0 +gfx950,256,256,1024,2048,ck,7,0,6.8139,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,157.58,461.66,0.0 +gfx950,256,256,1024,4096,ck,7,0,11.2167,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.45,514.16,0.0 +gfx950,256,256,1152,2048,ck,7,0,6.8666,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,175.92,505.84,0.0 +gfx950,256,256,1536,2048,ck,12,0,6.8003,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,236.84,655.33,0.0 +gfx950,256,256,2048,512,ck,12,0,3.9772,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,134.99,560.25,0.0 +gfx950,256,256,2048,1024,ck,12,0,5.3818,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,199.51,633.22,0.0 +gfx950,256,256,2048,2048,ck,12,0,7.5893,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,282.96,759.91,0.0 +gfx950,256,256,2048,4096,ck,12,0,12.1126,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,354.59,865.69,0.0 +gfx950,256,256,2176,4096,ck,17,0,13.8813,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,328.74,797.88,0.0 +gfx950,256,256,2304,2048,ck,17,0,9.3755,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,257.68,685.03,0.0 +gfx950,256,256,2560,4096,ck,17,0,13.8476,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,387.7,927.6,0.0 +gfx950,256,256,3072,2048,ck,11,0,9.3418,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,344.82,897.96,0.0 +gfx950,256,256,4096,1024,ck,12,0,6.7543,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,317.94,970.29,0.0 +gfx950,256,256,4096,2048,ck,11,0,9.726,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,441.6,1132.02,0.0 +gfx950,256,256,4352,4096,ck,17,0,20.0387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,455.46,1053.09,0.0 +gfx950,256,256,4608,2048,ck,10,0,11.8567,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,407.52,1039.14,0.0214 +gfx950,256,256,4608,4096,ck,10,0,20.4557,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,472.42,1089.29,0.0198 +gfx950,256,256,5120,4096,ck,17,0,20.7969,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,516.3,1184.87,0.0 +gfx950,256,256,6144,2048,ck,12,0,12.3901,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,519.97,1311.77,0.0 +gfx950,256,256,8704,4096,ck,14,0,28.0912,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,649.8,1465.11,0.0 +gfx950,256,256,9216,2048,ck,2,0,16.9832,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,569.01,1420.07,0.0 +gfx950,256,256,10240,4096,ck,14,0,29.4165,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,730.03,1639.71,0.0 +gfx950,256,256,12288,2048,ck,15,0,17.9751,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,716.82,1779.22,0.0022 +gfx950,256,256,17408,4096,ck,14,0,41.891,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,871.48,1939.91,0.0002 +gfx950,256,256,20480,4096,ck,14,0,44.3436,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,968.57,2151.84,0.0002 +gfx950,256,512,16,4096,cktile,14,0,16.0332,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_3,4.19,135.91,0.0 +gfx950,256,512,32,4096,cktile,12,0,16.3069,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,8.23,138.65,0.0 +gfx950,256,512,256,512,ck,7,0,3.2978,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.7,198.73,0.0 +gfx950,256,512,512,256,ck,12,0,3.3898,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,39.59,232.0,0.0 +gfx950,256,512,512,512,ck,6,0,3.8003,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,70.64,275.92,0.0 +gfx950,256,512,512,1024,ck,12,0,4.7538,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,112.94,330.86,0.0 +gfx950,256,512,512,2048,ck,7,0,6.4199,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,167.25,408.33,0.0 +gfx950,256,512,1024,256,ck,12,0,3.4576,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,77.64,416.99,0.0 +gfx950,256,512,1024,512,ck,7,0,3.9003,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.65,470.48,0.0 +gfx950,256,512,1024,1024,ck,12,0,5.1465,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,208.64,509.36,0.0 +gfx950,256,512,1024,2048,ck,12,0,7.3305,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,292.95,572.17,0.0 +gfx950,256,512,1024,4096,ck,12,0,11.9299,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,360.02,615.26,0.0 +gfx950,256,512,1152,2048,ck,12,0,8.3546,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,289.17,549.1,0.0 +gfx950,256,512,1536,2048,ck,12,0,8.9195,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,361.14,646.58,0.0 +gfx950,256,512,2048,512,ck,11,0,5.0404,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,213.03,676.11,0.0 +gfx950,256,512,2048,1024,ck,11,0,6.3963,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,335.74,737.71,0.0 +gfx950,256,512,2048,2048,ck,17,0,9.7291,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,441.46,754.44,0.0 +gfx950,256,512,2048,4096,ck,17,0,13.9323,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,616.55,903.15,0.0 +gfx950,256,512,2176,4096,ck,15,0,19.7041,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,463.19,671.85,0.0002 +gfx950,256,512,2304,2048,ck,12,0,12.0763,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,400.11,672.93,0.0 +gfx950,256,512,2560,4096,ck,17,0,20.1587,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,532.64,754.23,0.0 +gfx950,256,512,3072,2048,ck,12,0,12.7345,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,505.91,823.41,0.0 +gfx950,256,512,4096,2048,ck,17,0,13.313,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,645.23,1023.92,0.0 +gfx950,256,512,4352,4096,ck,14,0,26.8411,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,680.06,908.29,0.0 +gfx950,256,512,4608,2048,ck,2,0,16.4601,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,587.1,923.71,0.0 +gfx950,256,512,4608,4096,ck,2,0,27.5561,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,701.38,932.28,0.0 +gfx950,256,512,5120,4096,ck,14,0,27.981,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,767.48,1011.81,0.0 +gfx950,256,512,6144,2048,ck,14,0,17.6425,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,730.33,1129.26,0.0001 +gfx950,256,512,8704,4096,ck,14,0,40.8265,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,894.2,1142.93,0.0002 +gfx950,256,512,9216,2048,ck,14,0,23.8481,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,810.44,1231.13,0.0002 +gfx950,256,512,10240,4096,ck,14,0,43.0694,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,997.22,1266.0,0.0002 +gfx950,256,512,12288,2048,ck,14,0,25.1576,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1024.33,1542.17,0.0002 +gfx950,256,512,17408,4096,cktile,26,0,57.9873,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1259.15,1573.21,0.0 +gfx950,256,512,20480,4096,cktile,26,0,58.6077,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1465.67,1824.93,0.0 +gfx950,256,1024,16,4096,cktile,2,0,17.2261,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,7.79,249.19,0.0 +gfx950,256,1024,32,4096,cktile,2,0,17.4819,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,15.36,251.17,0.0 +gfx950,256,1024,256,512,ck,12,0,3.9174,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,68.52,301.13,0.0 +gfx950,256,1024,512,256,ck,12,0,3.4621,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,77.54,416.45,0.0 +gfx950,256,1024,512,512,ck,6,0,3.9298,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,136.62,466.95,0.0 +gfx950,256,1024,512,1024,ck,12,0,5.4144,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,198.31,484.16,0.0 +gfx950,256,1024,512,2048,ck,12,0,6.9139,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,310.6,606.65,0.0 +gfx950,256,1024,1024,256,ck,12,0,4.2083,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,127.57,622.92,0.0 +gfx950,256,1024,1024,512,ck,11,0,4.9934,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,215.03,629.98,0.0 +gfx950,256,1024,1024,1024,ck,17,0,6.6739,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,321.77,628.46,0.0 +gfx950,256,1024,1024,2048,ck,17,0,10.2081,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,420.74,616.32,0.0 +gfx950,256,1024,1024,4096,ck,17,0,14.2436,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,603.07,736.17,0.0 +gfx950,256,1024,1152,2048,ck,17,0,12.1623,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,397.28,560.4,0.0 +gfx950,256,1024,1536,2048,ck,17,0,12.7918,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,503.64,655.78,0.0 +gfx950,256,1024,2048,512,ck,17,0,6.2043,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,346.13,929.54,0.0 +gfx950,256,1024,2048,1024,ck,15,0,9.4687,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,453.6,775.19,0.0005 +gfx950,256,1024,2048,2048,ck,17,0,13.1692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,652.27,796.23,0.0 +gfx950,256,1024,2048,4096,ck,17,0,21.7907,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,788.4,769.93,0.0 +gfx950,256,1024,2176,4096,ck,14,0,26.8779,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,679.13,653.46,0.0 +gfx950,256,1024,2304,2048,ck,2,0,16.7332,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,577.52,689.31,0.0 +gfx950,256,1024,2560,4096,ck,14,0,27.5173,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,780.41,724.02,0.0001 +gfx950,256,1024,3072,2048,ck,14,0,17.6286,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,730.91,832.74,0.0001 +gfx950,256,1024,4096,1024,ck,14,0,12.7825,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,672.01,1066.42,0.0 +gfx950,256,1024,4096,2048,ck,14,0,18.6898,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,919.21,1009.88,0.0002 +gfx950,256,1024,4352,4096,ck,14,0,40.6091,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,898.99,761.73,0.0001 +gfx950,256,1024,4608,2048,ck,14,0,23.4479,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,824.27,894.39,0.0002 +gfx950,256,1024,4608,4096,ck,14,0,40.7002,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.74,798.67,0.0001 +gfx950,256,1024,5120,4096,ck,1,0,42.2835,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,1015.75,843.16,0.0 +gfx950,256,1024,6144,2048,ck,14,0,24.9928,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1031.09,1090.83,0.0002 +gfx950,256,1024,8704,4096,cktile,26,0,58.5138,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1247.82,985.61,0.0 +gfx950,256,1024,9216,2048,cktile,26,0,37.0071,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1044.52,1076.71,0.0 +gfx950,256,1024,10240,4096,cktile,11,0,58.1448,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1477.33,1154.17,0.0 +gfx950,256,1024,12288,2048,ck,13,0,45.3706,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1135.97,1155.57,0.0 +gfx950,256,1024,17408,4096,ck,13,0,109.526,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1333.28,1014.82,0.0 +gfx950,256,1024,20480,4096,cktile,11,0,112.2264,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1530.82,1158.58,0.0 +gfx950,256,2048,16,4096,cktile,2,0,17.6907,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,15.17,481.59,0.0 +gfx950,256,2048,32,4096,cktile,1,0,17.7872,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,30.18,486.35,0.0 +gfx950,256,2048,256,512,ck,6,0,3.9969,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,134.32,557.49,0.0 +gfx950,256,2048,512,256,ck,4,0,4.3267,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,124.08,636.17,0.0 +gfx950,256,2048,512,512,ck,11,0,4.9635,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,216.33,686.59,0.0 +gfx950,256,2048,512,1024,ck,9,0,7.2831,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,294.86,647.88,0.0 +gfx950,256,2048,512,2048,ck,11,0,10.3233,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,416.05,711.02,0.0 +gfx950,256,2048,1024,256,ck,17,0,5.7494,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,186.76,866.31,0.0 +gfx950,256,2048,1024,512,ck,14,0,6.599,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,325.43,873.95,0.0 +gfx950,256,2048,1024,1024,ck,17,0,9.7776,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,439.27,750.7,0.0 +gfx950,256,2048,1024,2048,ck,17,0,13.7534,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,624.57,762.41,0.0 +gfx950,256,2048,1024,4096,ck,17,0,21.6011,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,795.32,776.68,0.0 +gfx950,256,2048,1152,2048,ck,15,0,16.8691,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,572.86,668.22,0.0011 +gfx950,256,2048,1536,2048,ck,14,0,18.0163,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,715.18,756.62,0.0001 +gfx950,256,2048,2048,512,ck,17,0,10.3309,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,415.74,1014.99,0.0 +gfx950,256,2048,2048,1024,ck,14,0,13.6276,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,630.33,923.34,0.0 +gfx950,256,2048,2048,2048,ck,14,0,18.8518,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,911.31,889.95,0.0002 +gfx950,256,2048,2048,4096,ck,14,0,30.6689,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1120.34,820.56,0.0002 +gfx950,256,2048,2176,4096,ck,14,0,39.7142,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,919.25,660.08,0.0002 +gfx950,256,2048,2304,2048,ck,14,0,23.6368,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,817.68,776.34,0.0001 +gfx950,256,2048,2560,4096,ck,14,0,41.4725,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1035.62,707.94,0.0002 +gfx950,256,2048,3072,2048,ck,14,0,25.6421,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1004.98,899.64,0.0003 +gfx950,256,2048,4096,1024,ck,13,0,21.1184,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,813.5,1092.35,0.0 +gfx950,256,2048,4096,2048,ck,13,0,29.6053,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1160.59,991.72,0.0 +gfx950,256,2048,4352,4096,cktile,26,0,58.3217,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1251.93,755.13,0.0 +gfx950,256,2048,4608,2048,cktile,28,0,37.2525,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1037.64,872.58,0.0 +gfx950,256,2048,4608,4096,cktile,26,0,57.5625,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1343.05,801.52,0.0 +gfx950,256,2048,5120,4096,cktile,11,0,57.9764,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1481.63,868.14,0.0 +gfx950,256,2048,6144,2048,ck,13,0,44.574,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1156.27,940.98,0.0 +gfx950,256,2048,8704,4096,ck,13,0,109.9061,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1328.67,725.09,0.0 +gfx950,256,2048,9216,2048,ck,13,0,66.817,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1157.03,910.21,0.0 +gfx950,256,2048,10240,4096,cktile,27,0,112.1256,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1532.2,822.96,0.0 +gfx950,256,2048,12288,2048,ck,13,0,79.2269,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1301.06,1005.87,0.0 +gfx950,256,2048,17408,4096,cktile,26,0,171.9678,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1698.33,878.04,0.0 +gfx950,256,2048,20480,4096,cktile,26,0,198.7021,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1729.21,886.56,0.0 +gfx950,256,4096,16,4096,cktile,2,0,17.9746,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,29.87,944.32,0.0 +gfx950,256,4096,32,4096,cktile,0,0,18.2014,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,58.99,943.36,0.0 +gfx950,256,4096,256,512,ck,12,0,5.1797,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,207.3,835.06,0.0 +gfx950,256,4096,512,256,ck,12,0,5.7231,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,187.62,938.99,0.0 +gfx950,256,4096,512,512,ck,16,0,6.0313,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,356.06,1086.6,0.0 +gfx950,256,4096,512,1024,ck,14,0,9.5983,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,447.47,928.59,0.0 +gfx950,256,4096,512,2048,ck,15,0,13.5435,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,634.25,1006.5,0.0009 +gfx950,256,4096,1024,256,ck,8,0,7.9335,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,270.69,1222.58,0.0 +gfx950,256,4096,1024,512,ck,16,0,10.3882,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,413.45,1059.86,0.0 +gfx950,256,4096,1024,1024,ck,14,0,13.1991,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,650.8,1032.76,0.0 +gfx950,256,4096,1024,2048,ck,14,0,19.096,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,899.66,988.39,0.0001 +gfx950,256,4096,1024,4096,ck,14,0,30.6367,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1121.52,958.33,0.0002 +gfx950,256,4096,1152,2048,ck,14,0,23.5712,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,819.96,856.35,0.0002 +gfx950,256,4096,1536,2048,ck,14,0,25.2772,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1019.49,954.11,0.0003 +gfx950,256,4096,2048,512,ck,16,0,15.7595,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,545.06,1264.19,0.0 +gfx950,256,4096,2048,1024,ck,13,0,20.6804,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,830.73,1115.48,0.0 +gfx950,256,4096,2048,2048,ck,13,0,29.3797,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1169.51,999.33,0.0 +gfx950,256,4096,2048,4096,ck,13,0,49.1486,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1398.2,853.39,0.0 +gfx950,256,4096,2176,4096,ck,14,0,63.494,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1149.94,685.35,0.0003 +gfx950,256,4096,2304,2048,cktile,26,0,38.0636,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1015.53,840.21,0.0 +gfx950,256,4096,2560,4096,cktile,28,0,59.7488,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1437.67,807.29,0.0 +gfx950,256,4096,3072,2048,ck,13,0,41.8638,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1231.13,951.8,0.0 +gfx950,256,4096,4096,1024,ck,13,0,38.902,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,883.24,1078.17,0.0 +gfx950,256,4096,4096,2048,ck,13,0,54.8859,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1252.04,917.02,0.0 +gfx950,256,4096,4352,4096,ck,13,0,106.0856,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1376.52,662.24,0.0 +gfx950,256,4096,4608,2048,ck,13,0,63.8104,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1211.55,870.93,0.0 +gfx950,256,4096,4608,4096,ck,13,0,107.8779,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1433.28,680.4,0.0 +gfx950,256,4096,5120,4096,cktile,28,0,113.4632,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1514.14,702.36,0.0 +gfx950,256,4096,6144,2048,ck,13,0,77.4893,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1330.24,920.17,0.0 +gfx950,256,4096,8704,4096,cktile,26,0,174.8011,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1670.8,707.84,0.0 +gfx950,256,4096,9216,2048,ck,13,0,112.9284,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1369.18,909.96,0.0 +gfx950,256,4096,10240,4096,ck,13,0,205.6209,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1671.02,693.54,0.0 +gfx950,256,4096,12288,2048,ck,13,0,143.8759,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1432.89,932.87,0.0 +gfx950,256,4096,17408,4096,ck,13,0,332.8382,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1754.95,693.09,0.0 +gfx950,256,4096,20480,4096,cktile,28,0,371.7097,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1848.74,722.16,0.0 +gfx950,256,8192,16,4096,cktile,1,0,25.2013,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,42.61,1344.46,0.0 +gfx950,256,8192,32,4096,cktile,2,0,25.1674,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_1,85.33,1359.29,0.0 +gfx950,256,8192,256,512,ck,9,0,6.6634,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,322.28,1278.58,0.0 +gfx950,256,8192,512,256,ck,14,0,7.7636,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,276.61,1367.51,0.0 +gfx950,256,8192,512,512,ck,1,0,10.1629,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,422.61,1263.92,0.0 +gfx950,256,8192,512,1024,ck,14,0,13.1835,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,651.57,1312.36,0.0 +gfx950,256,8192,512,2048,ck,14,0,18.8774,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,910.08,1388.67,0.0 +gfx950,256,8192,1024,256,ck,9,0,12.8029,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,335.47,1494.7,0.0 +gfx950,256,8192,1024,512,ck,11,0,15.7794,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,544.38,1362.27,0.0 +gfx950,256,8192,1024,1024,ck,13,0,20.8729,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,823.07,1255.91,0.0 +gfx950,256,8192,1024,2048,ck,13,0,29.8301,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1151.85,1195.15,0.0 +gfx950,256,8192,1024,4096,ck,13,0,50.1691,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1369.76,1086.84,0.0 +gfx950,256,8192,1152,2048,ck,14,0,38.0618,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1015.58,998.66,0.0003 +gfx950,256,8192,1536,2048,ck,13,0,44.5696,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1156.38,1011.65,0.0 +gfx950,256,8192,2048,512,ck,16,0,25.8314,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,665.08,1501.94,0.0 +gfx950,256,8192,2048,1024,ck,13,0,37.8569,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,907.62,1163.33,0.0 +gfx950,256,8192,2048,2048,ck,13,0,54.7439,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1255.29,996.02,0.0 +gfx950,256,8192,2048,4096,ck,13,0,91.6649,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1499.36,823.62,0.0 +gfx950,256,8192,2176,4096,ck,14,0,111.4756,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1309.96,700.77,0.0003 +gfx950,256,8192,2304,2048,ck,13,0,64.5936,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1196.86,917.19,0.0 +gfx950,256,8192,2560,4096,ck,13,0,114.6292,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1498.73,750.1,0.0 +gfx950,256,8192,3072,2048,ck,13,0,76.4197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1348.86,960.49,0.0 +gfx950,256,8192,4096,1024,ck,14,0,69.6632,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,986.45,1143.96,0.0003 +gfx950,256,8192,4096,2048,ck,13,0,100.0583,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1373.59,922.21,0.0 +gfx950,256,8192,4352,4096,cktile,28,0,175.3808,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1665.28,699.53,0.0 +gfx950,256,8192,4608,2048,ck,13,0,112.5915,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1373.27,903.37,0.0 +gfx950,256,8192,4608,4096,cktile,11,0,186.7698,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1655.72,684.94,0.0 +gfx950,256,8192,5120,4096,ck,13,0,204.8152,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1677.6,675.79,0.0 +gfx950,256,8192,6144,2048,ck,13,0,141.6849,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1455.05,917.69,0.0 +gfx950,256,8192,8704,4096,ck,13,0,331.6284,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1761.36,638.7,0.0 +gfx950,256,8192,9216,2048,ck,13,0,206.2058,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1499.66,905.15,0.0 +gfx950,256,8192,10240,4096,cktile,26,0,376.1269,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1827.03,646.78,0.0 +gfx950,256,8192,12288,2048,ck,13,0,266.9287,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1544.67,911.37,0.0 +gfx950,256,8192,17408,4096,cktile,11,0,617.9339,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1890.54,631.25,0.0 +gfx950,256,8192,20480,4096,cktile,11,0,708.3984,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1940.14,639.45,0.0 +gfx950,256,16384,16,4096,cktile,7,0,37.8328,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,56.76,1789.42,0.0 +gfx950,256,16384,32,4096,cktile,7,0,38.4387,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,111.74,1776.56,0.0 +gfx950,256,16384,256,512,ck,1,0,10.5907,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,405.54,1596.52,0.0 +gfx950,256,16384,512,256,ck,9,0,12.9271,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,332.25,1632.43,0.0 +gfx950,256,16384,512,512,ck,16,0,16.0509,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,535.17,1584.21,0.0 +gfx950,256,16384,512,1024,ck,13,0,20.9235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,821.08,1628.73,0.0 +gfx950,256,16384,512,2048,ck,13,0,29.9784,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1146.15,1713.91,0.0 +gfx950,256,16384,1024,256,ck,16,0,21.154,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,406.07,1796.86,0.0 +gfx950,256,16384,1024,512,ck,16,0,25.8117,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,665.58,1645.27,0.0 +gfx950,256,16384,1024,1024,ck,14,0,38.251,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,898.27,1343.24,0.0003 +gfx950,256,16384,1024,2048,ck,13,0,55.8164,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1231.17,1239.89,0.0 +gfx950,256,16384,1024,4096,ck,13,0,92.3826,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1487.71,1135.04,0.0 +gfx950,256,16384,1152,2048,ck,14,0,67.0848,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1152.41,1098.05,0.0004 +gfx950,256,16384,1536,2048,ck,13,0,78.1257,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1319.4,1114.0,0.0 +gfx950,256,16384,2048,512,ck,16,0,45.9039,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,748.51,1667.53,0.0 +gfx950,256,16384,2048,1024,ck,13,0,67.9688,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1011.04,1265.04,0.0 +gfx950,256,16384,2048,2048,ck,13,0,100.8975,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1362.16,1039.25,0.0 +gfx950,256,16384,2048,4096,ck,13,0,172.1554,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1596.68,828.36,0.0 +gfx950,256,16384,2176,4096,ck,0,0,200.586,a8w8_blockscale_bpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,1456.02,734.47,0.0 +gfx950,256,16384,2304,2048,ck,13,0,113.7928,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1358.78,999.8,0.0 +gfx950,256,16384,2560,4096,ck,13,0,207.1913,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1658.36,779.38,0.0 +gfx950,256,16384,3072,2048,ck,13,0,143.5179,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1436.46,979.04,0.0 +gfx950,256,16384,4096,1024,ck,13,0,131.4292,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1045.73,1180.78,0.0 +gfx950,256,16384,4096,2048,ck,13,0,187.6366,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1464.95,938.84,0.0 +gfx950,256,16384,4352,4096,ck,13,0,337.0876,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1732.83,675.02,0.0 +gfx950,256,16384,4608,2048,ck,13,0,201.776,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1532.58,961.4,0.0 +gfx950,256,16384,4608,4096,cktile,28,0,334.3939,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1849.54,708.68,0.0 +gfx950,256,16384,5120,4096,cktile,27,0,385.5837,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1782.22,663.55,0.0 +gfx950,256,16384,6144,2048,ck,13,0,269.0614,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1532.43,919.73,0.0 +gfx950,256,16384,8704,4096,cktile,26,0,625.4187,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1867.92,620.34,0.0 +gfx950,256,16384,9216,2048,ck,13,0,387.8534,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1594.61,913.8,0.0 +gfx950,256,16384,10240,4096,cktile,27,0,704.2104,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1951.67,631.34,0.0 +gfx950,256,16384,12288,2048,ck,13,0,504.5565,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1634.37,914.41,0.0 +gfx950,256,16384,17408,4096,ck,13,0,1208.8428,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1932.81,586.38,0.0 +gfx950,256,16384,20480,4096,cktile,27,0,1387.4271,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1981.21,592.52,0.0 +gfx950,256,32768,16,4096,cktile,19,0,55.9249,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,76.8,2419.89,0.0 +gfx950,256,32768,32,4096,cktile,18,0,56.1714,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,152.92,2429.1,0.0 +gfx950,256,32768,256,512,ck,16,0,16.1398,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,532.22,2087.11,0.0 +gfx950,256,32768,512,256,ck,16,0,21.12,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,406.72,1992.15,0.0 +gfx950,256,32768,512,512,ck,16,0,27.1113,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,633.68,1866.15,0.0 +gfx950,256,32768,512,1024,ck,14,0,38.8109,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,885.31,1742.63,0.0004 +gfx950,256,32768,512,2048,ck,13,0,56.8603,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1208.57,1788.8,0.0 +gfx950,256,32768,1024,256,ck,14,0,36.5227,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,470.39,2074.32,0.0 +gfx950,256,32768,1024,512,ck,16,0,46.8431,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,733.51,1801.98,0.0 +gfx950,256,32768,1024,1024,ck,13,0,70.7661,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,971.08,1437.3,0.0 +gfx950,256,32768,1024,2048,ck,13,0,103.0712,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1333.44,1322.53,0.0 +gfx950,256,32768,1024,4096,ck,13,0,172.4385,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1594.06,1191.85,0.0 +gfx950,256,32768,1152,2048,ck,14,0,122.1313,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1266.0,1186.97,0.0004 +gfx950,256,32768,1536,2048,ck,13,0,141.9473,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1452.36,1204.09,0.0 +gfx950,256,32768,2048,512,ck,16,0,85.8103,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,800.83,1771.86,0.0 +gfx950,256,32768,2048,1024,ck,13,0,131.0786,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1048.52,1295.93,0.0 +gfx950,256,32768,2048,2048,ck,13,0,195.3896,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1406.82,1051.85,0.0 +gfx950,256,32768,2048,4096,ck,13,0,336.668,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1632.93,822.25,0.0 +gfx950,256,32768,2176,4096,ck,0,0,361.4039,a8w8_blockscale_bpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,1616.24,790.63,0.0 +gfx950,256,32768,2304,2048,ck,13,0,210.3495,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1470.11,1059.3,0.0 +gfx950,256,32768,2560,4096,ck,13,0,392.3196,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1751.62,796.48,0.0 +gfx950,256,32768,3072,2048,ck,13,0,264.8182,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1556.98,1037.42,0.0 +gfx950,256,32768,4096,1024,ck,14,0,238.3201,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1153.4,1284.76,0.0004 +gfx950,256,32768,4096,2048,ck,13,1,345.9207,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1589.25,994.25,0.0 +gfx950,256,32768,4352,4096,cktile,28,0,623.3898,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1874.0,701.42,0.0 +gfx950,256,32768,4608,2048,ck,13,0,393.975,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1569.83,960.81,0.0 +gfx950,256,32768,4608,4096,cktile,26,0,663.6484,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1863.86,685.73,0.0 +gfx950,256,32768,5120,4096,ck,13,0,739.9087,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1857.51,663.24,0.0 +gfx950,256,32768,6144,2048,ck,13,0,513.6492,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1605.44,939.06,0.0 +gfx950,256,32768,8704,4096,cktile,26,0,1226.8874,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1904.38,603.39,0.0 +gfx950,256,32768,9216,2048,ck,13,0,741.7658,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1667.58,930.16,0.0 +gfx950,256,32768,10240,4096,cktile,28,0,1404.9529,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1956.49,603.04,0.0 +gfx950,256,32768,12288,2048,ck,13,0,990.9729,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1664.29,905.76,0.0 +gfx950,256,32768,17408,4096,cktile,27,0,2389.7849,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1955.37,563.39,0.0 +gfx950,256,32768,20480,4096,cktile,28,0,2721.3003,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2020.2,573.36,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3_235b.csv b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3_235b.csv index 5957a8be0b..aba5670813 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3_235b.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_qwen3_235b.csv @@ -1,88 +1,88 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,1280,4096,ck,7,0,12.9489,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.81,405.4,0.0 -256,2,1280,4096,ck,7,0,13.0111,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.61,403.98,0.0 -256,4,1280,4096,ck,7,0,12.855,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.26,409.92,0.0 -256,8,1280,4096,ck,7,0,13.1195,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.39,403.68,0.0 -256,16,1280,4096,ck,7,0,12.1515,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.81,440.22,0.0 -256,32,1280,4096,ck,7,0,12.0636,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.81,452.26,0.0 -256,64,1280,4096,ck,7,0,11.9591,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.12,474.02,0.0 -256,128,1280,4096,ck,7,0,11.795,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,113.79,516.73,0.0 -256,256,1280,4096,ck,12,0,12.6183,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,212.74,550.54,0.0 -256,512,1280,4096,ck,17,0,14.6566,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,366.3,590.23,0.0 -256,1024,1280,4096,ck,17,0,21.4933,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,499.57,561.04,0.0 -256,2048,1280,4096,ck,14,0,30.3452,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,707.68,621.99,0.0001 -256,4096,1280,4096,ck,13,0,43.6628,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,983.67,744.47,0.0 -256,8192,1280,4096,cktile,26,0,60.8285,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1412.16,982.58,0.0 -256,16384,1280,4096,cktile,11,0,129.3201,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1328.48,883.81,0.0 -256,32768,1280,4096,cktile,11,0,245.7508,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1398.15,908.83,0.0 -256,1,2304,4096,ck,7,0,12.8681,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.47,734.05,0.0 -256,2,2304,4096,ck,7,0,12.9331,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.92,731.04,0.0 -256,4,2304,4096,ck,7,0,12.8197,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.89,738.86,0.0 -256,8,2304,4096,ck,7,0,12.8411,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.76,740.34,0.0 -256,16,2304,4096,ck,7,0,11.8579,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.47,807.6,0.0 -256,32,2304,4096,ck,7,0,11.9991,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.34,809.7,0.0 -256,64,2304,4096,ck,7,0,12.0375,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,100.35,830.26,0.0 -256,128,2304,4096,ck,12,0,12.7292,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,189.79,828.9,0.0 -256,256,2304,4096,ck,17,0,14.8289,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,325.84,786.67,0.0 -256,512,2304,4096,ck,17,0,21.0801,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,458.43,659.09,0.0 -256,1024,2304,4096,ck,2,0,29.486,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,655.48,622.33,0.0 -256,2048,2304,4096,ck,14,0,43.9597,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,879.32,620.18,0.0001 -256,4096,2304,4096,cktile,28,0,59.1769,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1306.41,761.93,0.0 -256,8192,2304,4096,cktile,11,0,124.9288,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1237.66,646.29,0.0 -256,16384,2304,4096,cktile,11,0,218.1091,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1417.81,697.1,0.0 -256,32768,2304,4096,cktile,27,0,353.6351,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1748.91,833.2,0.0 -256,1,4096,1024,ck,7,0,5.355,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.57,784.97,0.0 -256,1,4096,4096,ck,7,0,12.6324,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.66,1329.08,0.0 -256,1,4096,8192,ck,7,0,23.6611,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.84,1418.82,0.0 -256,2,4096,1024,ck,7,0,5.3046,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.16,794.17,0.0 -256,2,4096,4096,ck,7,0,12.3902,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.42,1356.05,0.0 -256,2,4096,8192,ck,7,0,23.5334,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.7,1427.21,0.0 -256,4,4096,4096,ck,7,0,12.5586,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.69,1339.83,0.0 -256,4,4096,8192,ck,7,0,23.3798,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.48,1437.99,0.0 -256,8,4096,1024,ck,7,0,5.3754,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.48,793.99,0.0 -256,8,4096,4096,ck,7,0,12.5781,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.34,1341.66,0.0 -256,8,4096,8192,ck,7,0,23.5614,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.79,1429.69,0.0 -256,16,4096,4096,ck,7,0,11.6041,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.27,1462.74,0.0 -256,16,4096,8192,ck,7,0,21.0936,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.9,1603.17,0.0 -256,32,4096,1024,ck,7,0,5.554,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.33,808.29,0.0 -256,32,4096,4096,ck,7,0,11.5906,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.64,1481.41,0.0 -256,32,4096,8192,ck,7,0,21.1951,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,101.32,1607.86,0.0 -256,64,4096,1024,ck,7,0,5.5953,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,95.95,855.03,0.0 -256,64,4096,4096,ck,7,0,11.5495,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,185.94,1520.73,0.0 -256,64,4096,8192,ck,7,0,20.9352,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,205.16,1652.86,0.0 -256,128,4096,4096,ck,12,0,12.5223,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,342.99,1465.39,0.0 -256,128,4096,8192,ck,12,0,22.6226,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,379.71,1575.93,0.0 -256,256,4096,4096,ck,17,0,14.9257,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,575.51,1334.81,0.0 -256,256,4096,8192,ck,17,0,27.0469,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,635.19,1395.68,0.0 -256,512,4096,1024,ck,2,0,9.3277,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,460.45,955.53,0.0 -256,512,4096,4096,ck,14,0,23.4392,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,732.95,984.19,0.0 -256,512,4096,8192,ck,17,0,45.8497,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,749.4,914.79,0.0 -256,1024,4096,4096,ck,13,0,34.5604,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,994.19,849.53,0.0 -256,1024,4096,8192,ck,13,0,65.123,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1055.23,772.87,0.0 -256,2048,4096,4096,ck,13,0,58.305,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1178.62,719.37,0.0 -256,2048,4096,8192,cktile,11,0,101.1121,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1359.27,663.71,0.0 -256,4096,4096,4096,ck,13,0,110.1621,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1247.61,609.18,0.0 -256,4096,4096,8192,ck,13,0,195.9346,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1402.91,513.76,0.0 -256,8192,4096,4096,cktile,11,0,184.2345,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1492.0,637.45,0.0 -256,8192,4096,8192,cktile,26,0,291.554,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1885.61,575.44,0.0 -256,16384,4096,4096,cktile,11,0,352.0458,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1561.6,619.53,0.0 -256,16384,4096,8192,cktile,28,0,567.6935,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1936.81,531.96,0.0 -256,32768,4096,4096,cktile,28,0,621.3209,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1769.64,675.06,0.0 -256,32768,4096,8192,cktile,11,0,1063.8445,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2067.05,536.19,0.0 -256,16,4608,4096,ck,7,0,12.0322,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.2,1586.36,0.0 -256,1,9216,4096,ck,7,0,14.1275,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.34,2673.6,0.0 -256,2,9216,4096,ck,7,0,14.1631,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.66,2668.47,0.0 -256,4,9216,4096,ck,7,0,14.127,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.38,2678.48,0.0 -256,8,9216,4096,ck,7,0,14.2075,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.51,2669.64,0.0 -256,16,9216,4096,ck,7,0,13.0948,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.25,2910.25,0.0 -256,32,9216,4096,ck,12,0,14.1195,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,171.11,2724.57,0.0 -256,64,9216,4096,ck,17,0,16.2567,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,297.22,2410.73,0.0 -256,128,9216,4096,ck,17,0,23.1218,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,417.95,1757.32,0.0 -256,256,9216,4096,ck,14,0,30.5777,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,632.07,1423.13,0.0 -256,512,9216,4096,ck,14,0,44.6573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,865.59,1103.58,0.0002 -256,1024,9216,4096,cktile,26,0,57.4495,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1345.69,1058.62,0.0 -256,2048,9216,4096,cktile,11,0,117.8066,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1312.48,712.07,0.0 -256,4096,9216,4096,cktile,11,0,201.5236,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1534.5,645.2,0.0 -256,8192,9216,4096,cktile,28,0,338.5642,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1826.76,656.59,0.0 -256,16384,9216,4096,cktile,26,0,648.1251,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1908.51,627.73,0.0 -256,32768,9216,4096,cktile,27,0,1266.6884,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1953.05,612.58,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,1280,4096,ck,7,0,12.9489,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.81,405.4,0.0 +gfx950,256,2,1280,4096,ck,7,0,13.0111,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.61,403.98,0.0 +gfx950,256,4,1280,4096,ck,7,0,12.855,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.26,409.92,0.0 +gfx950,256,8,1280,4096,ck,7,0,13.1195,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.39,403.68,0.0 +gfx950,256,16,1280,4096,ck,7,0,12.1515,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.81,440.22,0.0 +gfx950,256,32,1280,4096,ck,7,0,12.0636,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.81,452.26,0.0 +gfx950,256,64,1280,4096,ck,7,0,11.9591,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.12,474.02,0.0 +gfx950,256,128,1280,4096,ck,7,0,11.795,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,113.79,516.73,0.0 +gfx950,256,256,1280,4096,ck,12,0,12.6183,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,212.74,550.54,0.0 +gfx950,256,512,1280,4096,ck,17,0,14.6566,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,366.3,590.23,0.0 +gfx950,256,1024,1280,4096,ck,17,0,21.4933,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,499.57,561.04,0.0 +gfx950,256,2048,1280,4096,ck,14,0,30.3452,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,707.68,621.99,0.0001 +gfx950,256,4096,1280,4096,ck,13,0,43.6628,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,983.67,744.47,0.0 +gfx950,256,8192,1280,4096,cktile,26,0,60.8285,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1412.16,982.58,0.0 +gfx950,256,16384,1280,4096,cktile,11,0,129.3201,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1328.48,883.81,0.0 +gfx950,256,32768,1280,4096,cktile,11,0,245.7508,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1398.15,908.83,0.0 +gfx950,256,1,2304,4096,ck,7,0,12.8681,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.47,734.05,0.0 +gfx950,256,2,2304,4096,ck,7,0,12.9331,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.92,731.04,0.0 +gfx950,256,4,2304,4096,ck,7,0,12.8197,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.89,738.86,0.0 +gfx950,256,8,2304,4096,ck,7,0,12.8411,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.76,740.34,0.0 +gfx950,256,16,2304,4096,ck,7,0,11.8579,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.47,807.6,0.0 +gfx950,256,32,2304,4096,ck,7,0,11.9991,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.34,809.7,0.0 +gfx950,256,64,2304,4096,ck,7,0,12.0375,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,100.35,830.26,0.0 +gfx950,256,128,2304,4096,ck,12,0,12.7292,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,189.79,828.9,0.0 +gfx950,256,256,2304,4096,ck,17,0,14.8289,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,325.84,786.67,0.0 +gfx950,256,512,2304,4096,ck,17,0,21.0801,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,458.43,659.09,0.0 +gfx950,256,1024,2304,4096,ck,2,0,29.486,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,655.48,622.33,0.0 +gfx950,256,2048,2304,4096,ck,14,0,43.9597,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,879.32,620.18,0.0001 +gfx950,256,4096,2304,4096,cktile,28,0,59.1769,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1306.41,761.93,0.0 +gfx950,256,8192,2304,4096,cktile,11,0,124.9288,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1237.66,646.29,0.0 +gfx950,256,16384,2304,4096,cktile,11,0,218.1091,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1417.81,697.1,0.0 +gfx950,256,32768,2304,4096,cktile,27,0,353.6351,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1748.91,833.2,0.0 +gfx950,256,1,4096,1024,ck,7,0,5.355,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.57,784.97,0.0 +gfx950,256,1,4096,4096,ck,7,0,12.6324,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.66,1329.08,0.0 +gfx950,256,1,4096,8192,ck,7,0,23.6611,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.84,1418.82,0.0 +gfx950,256,2,4096,1024,ck,7,0,5.3046,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.16,794.17,0.0 +gfx950,256,2,4096,4096,ck,7,0,12.3902,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.42,1356.05,0.0 +gfx950,256,2,4096,8192,ck,7,0,23.5334,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.7,1427.21,0.0 +gfx950,256,4,4096,4096,ck,7,0,12.5586,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.69,1339.83,0.0 +gfx950,256,4,4096,8192,ck,7,0,23.3798,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.48,1437.99,0.0 +gfx950,256,8,4096,1024,ck,7,0,5.3754,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.48,793.99,0.0 +gfx950,256,8,4096,4096,ck,7,0,12.5781,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.34,1341.66,0.0 +gfx950,256,8,4096,8192,ck,7,0,23.5614,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.79,1429.69,0.0 +gfx950,256,16,4096,4096,ck,7,0,11.6041,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.27,1462.74,0.0 +gfx950,256,16,4096,8192,ck,7,0,21.0936,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.9,1603.17,0.0 +gfx950,256,32,4096,1024,ck,7,0,5.554,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.33,808.29,0.0 +gfx950,256,32,4096,4096,ck,7,0,11.5906,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.64,1481.41,0.0 +gfx950,256,32,4096,8192,ck,7,0,21.1951,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,101.32,1607.86,0.0 +gfx950,256,64,4096,1024,ck,7,0,5.5953,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,95.95,855.03,0.0 +gfx950,256,64,4096,4096,ck,7,0,11.5495,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,185.94,1520.73,0.0 +gfx950,256,64,4096,8192,ck,7,0,20.9352,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,205.16,1652.86,0.0 +gfx950,256,128,4096,4096,ck,12,0,12.5223,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,342.99,1465.39,0.0 +gfx950,256,128,4096,8192,ck,12,0,22.6226,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,379.71,1575.93,0.0 +gfx950,256,256,4096,4096,ck,17,0,14.9257,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,575.51,1334.81,0.0 +gfx950,256,256,4096,8192,ck,17,0,27.0469,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,635.19,1395.68,0.0 +gfx950,256,512,4096,1024,ck,2,0,9.3277,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,460.45,955.53,0.0 +gfx950,256,512,4096,4096,ck,14,0,23.4392,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,732.95,984.19,0.0 +gfx950,256,512,4096,8192,ck,17,0,45.8497,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,749.4,914.79,0.0 +gfx950,256,1024,4096,4096,ck,13,0,34.5604,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,994.19,849.53,0.0 +gfx950,256,1024,4096,8192,ck,13,0,65.123,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1055.23,772.87,0.0 +gfx950,256,2048,4096,4096,ck,13,0,58.305,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1178.62,719.37,0.0 +gfx950,256,2048,4096,8192,cktile,11,0,101.1121,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1359.27,663.71,0.0 +gfx950,256,4096,4096,4096,ck,13,0,110.1621,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1247.61,609.18,0.0 +gfx950,256,4096,4096,8192,ck,13,0,195.9346,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1402.91,513.76,0.0 +gfx950,256,8192,4096,4096,cktile,11,0,184.2345,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1492.0,637.45,0.0 +gfx950,256,8192,4096,8192,cktile,26,0,291.554,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1885.61,575.44,0.0 +gfx950,256,16384,4096,4096,cktile,11,0,352.0458,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1561.6,619.53,0.0 +gfx950,256,16384,4096,8192,cktile,28,0,567.6935,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1936.81,531.96,0.0 +gfx950,256,32768,4096,4096,cktile,28,0,621.3209,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1769.64,675.06,0.0 +gfx950,256,32768,4096,8192,cktile,11,0,1063.8445,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2067.05,536.19,0.0 +gfx950,256,16,4608,4096,ck,7,0,12.0322,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.2,1586.36,0.0 +gfx950,256,1,9216,4096,ck,7,0,14.1275,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.34,2673.6,0.0 +gfx950,256,2,9216,4096,ck,7,0,14.1631,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.66,2668.47,0.0 +gfx950,256,4,9216,4096,ck,7,0,14.127,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.38,2678.48,0.0 +gfx950,256,8,9216,4096,ck,7,0,14.2075,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.51,2669.64,0.0 +gfx950,256,16,9216,4096,ck,7,0,13.0948,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.25,2910.25,0.0 +gfx950,256,32,9216,4096,ck,12,0,14.1195,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,171.11,2724.57,0.0 +gfx950,256,64,9216,4096,ck,17,0,16.2567,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,297.22,2410.73,0.0 +gfx950,256,128,9216,4096,ck,17,0,23.1218,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,417.95,1757.32,0.0 +gfx950,256,256,9216,4096,ck,14,0,30.5777,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,632.07,1423.13,0.0 +gfx950,256,512,9216,4096,ck,14,0,44.6573,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,865.59,1103.58,0.0002 +gfx950,256,1024,9216,4096,cktile,26,0,57.4495,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1345.69,1058.62,0.0 +gfx950,256,2048,9216,4096,cktile,11,0,117.8066,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1312.48,712.07,0.0 +gfx950,256,4096,9216,4096,cktile,11,0,201.5236,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1534.5,645.2,0.0 +gfx950,256,8192,9216,4096,cktile,28,0,338.5642,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1826.76,656.59,0.0 +gfx950,256,16384,9216,4096,cktile,26,0,648.1251,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1908.51,627.73,0.0 +gfx950,256,32768,9216,4096,cktile,27,0,1266.6884,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1953.05,612.58,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv index 82a9644838..f2821e0d1f 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv @@ -1,1021 +1,1021 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,128,7168,ck,8,0,15.3942,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.12,60.08,0.0 -256,1,512,7168,ck,8,0,22.2417,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.33,165.37,0.0 -256,1,1024,7168,ck,8,0,22.7838,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.64,322.56,0.0 -256,1,2112,7168,ck,8,0,22.4396,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.35,675.15,0.0 -256,1,2240,7168,ck,8,0,22.4289,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.43,716.4,0.0 -256,1,3072,1536,ck,8,0,6.5611,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.44,720.35,0.0 -256,1,4096,512,cktile,1,0,3.3013,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,1.27,637.89,0.0 -256,1,4096,7168,ck,8,0,22.7643,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.58,1290.42,0.0 -256,1,4608,7168,ck,8,0,22.7223,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.91,1454.37,0.0 -256,1,7168,256,ck,8,0,2.9182,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.26,633.82,0.0 -256,1,7168,512,ck,13,0,4.049,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,1.81,910.07,0.0 -256,1,7168,2048,ck,8,0,7.9966,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.67,1837.84,0.0 -256,1,7168,2304,ck,8,0,9.5203,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.47,1736.47,0.0 -256,1,7168,4096,ck,8,0,14.5736,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.03,2015.88,0.0 -256,1,7168,4608,ck,8,0,15.7127,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.2,2103.34,0.0 -256,1,7168,16384,ck,8,0,50.6751,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.64,2318.13,0.0 -256,1,7168,18432,ck,8,0,57.4061,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.6,2302.08,0.0 -256,1,8192,512,ck,13,0,4.0954,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,2.05,1028.28,0.0 -256,1,8192,1536,ck,8,0,6.2908,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.0,2003.06,0.0 -256,1,9216,7168,ck,8,0,24.1893,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.46,2732.03,0.0 -256,1,11264,1536,ck,8,0,7.5534,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.58,2293.74,0.0 -256,1,12288,1536,ck,8,0,7.7799,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.85,2429.4,0.0 -256,1,14336,1536,ck,8,0,8.0858,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.45,2727.04,0.0 -256,1,16384,512,ck,8,0,4.6738,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.59,1801.94,0.0 -256,1,20480,1536,ck,7,0,8.6493,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,7.27,3641.89,0.0 -256,1,24576,1536,ck,8,0,9.3079,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.11,4061.0,0.0 -256,1,32768,512,ck,8,0,5.2866,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.35,3186.03,0.0 -256,1,32768,1536,ck,8,0,11.0399,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.12,4565.14,0.0 -256,1,36864,7168,ck,8,0,51.5672,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.25,5125.78,0.0 -256,2,128,7168,ck,8,0,15.3762,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.24,60.64,0.0 -256,2,512,7168,ck,8,0,22.2604,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.66,165.6,0.0 -256,2,1024,7168,ck,8,0,22.8521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.28,322.0,0.0 -256,2,2112,7168,ck,8,0,22.4493,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.7,675.37,0.0 -256,2,2240,7168,ck,8,0,22.4022,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.87,717.77,0.0 -256,2,3072,1536,ck,8,0,6.4642,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.92,732.33,0.0 -256,2,4096,512,ck,8,0,3.401,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.47,621.75,0.0 -256,2,4096,7168,ck,8,0,22.7895,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.15,1289.67,0.0 -256,2,4608,7168,ck,8,0,22.751,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.81,1453.25,0.0 -256,2,7168,256,ck,8,0,2.908,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.52,641.06,0.0 -256,2,7168,512,ck,13,0,4.0902,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,3.59,904.53,0.0 -256,2,7168,2048,ck,8,0,8.4117,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.98,1749.09,0.0 -256,2,7168,2304,ck,8,0,8.8052,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.5,1879.38,0.0 -256,2,7168,4096,ck,8,0,14.5727,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.06,2017.26,0.0 -256,2,7168,4608,ck,8,0,15.7396,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.39,2100.94,0.0 -256,2,7168,16384,ck,8,0,50.6878,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.27,2318.15,0.0 -256,2,7168,18432,ck,8,0,57.2045,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.24,2310.76,0.0 -256,2,8192,512,ck,13,0,4.117,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,4.08,1026.98,0.0 -256,2,8192,1536,ck,8,0,7.3302,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.87,1721.47,0.0 -256,2,9216,7168,ck,8,0,24.268,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.89,2724.22,0.0 -256,2,11264,1536,ck,8,0,6.4805,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.68,2677.21,0.0 -256,2,12288,1536,ck,8,0,6.5398,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.54,2894.06,0.0 -256,2,14336,1536,ck,8,0,8.0887,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.89,2729.8,0.0 -256,2,16384,512,ck,8,0,4.7294,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.09,1787.79,0.0 -256,2,20480,1536,ck,7,0,8.6736,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,14.51,3636.58,0.0 -256,2,24576,1536,ck,8,0,9.316,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.21,4062.91,0.0 -256,2,32768,512,ck,8,0,5.3243,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.6,3175.88,0.0 -256,2,32768,1536,ck,8,0,11.0733,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.18,4557.43,0.0 -256,2,36864,7168,ck,8,0,51.4438,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.55,5139.65,0.0 -256,4,128,7168,ck,8,0,15.4615,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.47,61.26,0.0 -256,4,512,7168,ck,8,0,22.3068,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.32,165.99,0.0 -256,4,1024,7168,ck,8,0,22.8322,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.57,323.09,0.0 -256,4,2112,7168,ck,8,0,22.473,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.39,675.67,0.0 -256,4,2240,7168,ck,8,0,22.434,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.73,717.79,0.0 -256,4,3072,1536,ck,8,0,6.7718,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.57,701.34,0.0 -256,4,4096,512,cktile,0,0,3.2399,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,5.18,658.04,0.0 -256,4,4096,7168,ck,8,0,22.7401,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.33,1293.82,0.0 -256,4,4608,7168,ck,8,0,22.8359,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.57,1449.28,0.0 -256,4,7168,256,ck,8,0,2.9298,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.01,646.25,0.0 -256,4,7168,512,ck,13,0,4.0665,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,7.22,917.11,0.0 -256,4,7168,2048,ck,8,0,8.3038,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.14,1775.77,0.0 -256,4,7168,2304,ck,8,0,8.8224,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.98,1879.49,0.0 -256,4,7168,4096,ck,8,0,14.5954,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.09,2016.65,0.0 -256,4,7168,4608,ck,8,0,15.7107,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.82,2107.22,0.0 -256,4,7168,16384,ck,8,0,50.7328,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.52,2317.31,0.0 -256,4,7168,18432,ck,8,0,57.5155,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.38,2299.41,0.0 -256,4,8192,512,ck,13,0,4.1178,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,8.15,1034.99,0.0 -256,4,8192,1536,ck,8,0,6.5931,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,15.27,1919.37,0.0 -256,4,9216,7168,ck,8,0,24.2612,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.78,2727.1,0.0 -256,4,11264,1536,ck,8,0,6.5359,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.18,2661.88,0.0 -256,4,12288,1536,ck,8,0,7.7422,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.5,2451.35,0.0 -256,4,14336,1536,ck,8,0,8.0317,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.93,2756.69,0.0 -256,4,16384,512,ck,8,0,4.6944,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.3,1815.3,0.0 -256,4,20480,1536,ck,7,0,8.7372,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,28.8,3619.84,0.0 -256,4,24576,1536,ck,8,0,9.41,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.09,4033.1,0.0 -256,4,32768,512,ck,8,0,5.3284,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.19,3198.22,0.0 -256,4,32768,1536,ck,8,0,11.1225,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.2,4549.33,0.0 -256,4,36864,7168,ck,8,0,51.9106,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.72,5096.55,0.0 -256,8,128,7168,ck,8,0,15.5627,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.94,62.77,0.0 -256,8,512,7168,ck,8,0,22.3452,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.63,167.17,0.0 -256,8,1024,7168,ck,8,0,22.9843,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.11,322.56,0.0 -256,8,2112,7168,ck,8,0,22.4589,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.79,678.13,0.0 -256,8,2240,7168,ck,8,0,22.4802,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.43,718.39,0.0 -256,8,3072,1536,ck,8,0,6.482,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.65,737.43,0.0 -256,8,4096,512,ck,8,0,3.4272,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.79,632.23,0.0 -256,8,4096,7168,ck,8,0,22.9113,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.5,1286.83,0.0 -256,8,4608,7168,ck,8,0,22.808,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.17,1453.93,0.0 -256,8,7168,256,ck,8,0,2.9146,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.07,669.64,0.0 -256,8,7168,512,ck,13,0,4.1122,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,14.28,921.36,0.0 -256,8,7168,2048,ck,8,0,8.1857,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.69,1809.39,0.0 -256,8,7168,2304,ck,8,0,8.8267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.94,1886.12,0.0 -256,8,7168,4096,ck,8,0,14.65,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.07,2014.17,0.0 -256,8,7168,4608,ck,8,0,15.796,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.46,2100.64,0.0 -256,8,7168,16384,ck,8,0,50.8429,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.96,2314.7,0.0 -256,8,7168,18432,ck,8,0,57.4158,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.82,2305.68,0.0 -256,8,8192,512,ck,13,0,4.1533,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,16.16,1042.42,0.0 -256,8,8192,1536,ck,8,0,6.4583,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,31.17,1970.53,0.0 -256,8,9216,7168,ck,8,0,24.3102,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.48,2725.81,0.0 -256,8,11264,1536,ck,8,0,6.624,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.79,2641.0,0.0 -256,8,12288,1536,ck,8,0,7.0261,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.98,2716.05,0.0 -256,8,14336,1536,ck,8,0,8.0884,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.56,2752.31,0.0 -256,8,16384,512,ck,8,0,4.6609,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.8,1856.9,0.0 -256,8,20480,1536,ck,7,0,8.8663,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,56.77,3586.3,0.0 -256,8,24576,1536,ck,8,0,9.4837,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,63.69,4023.14,0.0 -256,8,32768,512,ck,8,0,5.3293,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.37,3247.26,0.0 -256,8,32768,1536,ck,8,0,11.1791,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,72.04,4550.3,0.0 -256,8,36864,7168,ck,8,0,52.7222,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,80.19,5024.23,0.0 -256,16,64,7168,ck,8,0,13.753,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.07,41.84,0.0 -256,16,128,7168,ck,8,0,13.8217,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.12,74.98,0.0 -256,16,512,7168,ck,8,0,20.9785,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.6,181.19,0.0 -256,16,576,7168,ck,8,0,21.1695,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.24,201.32,0.0 -256,16,1024,7168,ck,8,0,21.4879,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.93,348.45,0.0 -256,16,1536,7168,ck,8,0,21.2635,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.57,525.5,0.0 -256,16,2112,7168,ck,8,0,21.0289,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.04,728.57,0.0 -256,16,2240,7168,ck,8,0,21.0862,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.37,770.3,0.0 -256,16,3072,1536,ck,8,0,6.5418,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.08,740.08,0.0 -256,16,4096,512,cktile,3,0,3.3118,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_2,20.26,675.29,0.0 -256,16,4096,7168,ck,8,0,21.5063,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.69,1376.61,0.0 -256,16,4608,7168,ck,8,0,21.4253,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.33,1553.88,0.0 -256,16,7168,256,ck,8,0,2.9668,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.79,697.21,0.0 -256,16,7168,512,ck,8,0,4.1538,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.27,940.73,0.0 -256,16,7168,2048,ck,8,0,8.5525,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.93,1747.12,0.0 -256,16,7168,2304,ck,8,0,8.5986,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,61.46,1951.63,0.0 -256,16,7168,4096,ck,8,0,14.2602,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,65.88,2079.57,0.0 -256,16,7168,4608,ck,8,0,14.92,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,70.84,2234.13,0.0 -256,16,7168,16384,ck,8,0,49.2174,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,76.36,2396.15,0.0 -256,16,7168,18432,ck,8,0,54.4636,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.63,2435.48,0.0 -256,16,8192,512,ck,8,0,4.1546,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.31,1074.63,0.0 -256,16,8192,1536,ck,8,0,6.991,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,57.6,1840.89,0.0 -256,16,9216,7168,ck,8,0,23.3394,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,90.57,2847.97,0.0 -256,16,11264,1536,ck,8,0,6.5361,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,84.71,2705.98,0.0 -256,16,12288,1536,ck,8,0,7.8318,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.12,2463.31,0.0 -256,16,14336,1536,ck,8,0,8.0559,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,87.47,2793.41,0.0 -256,16,16384,512,ck,8,0,4.6334,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,57.93,1925.39,0.0 -256,16,20480,1536,ck,7,0,9.1661,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,109.82,3506.09,0.0 -256,16,24576,1536,ck,8,0,9.7004,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,124.53,3975.07,0.0 -256,16,32768,512,ck,7,0,5.5119,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,97.4,3235.54,0.0 -256,16,32768,1536,ck,8,0,11.4461,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.71,4491.03,0.0 -256,16,36864,7168,ck,8,0,53.9674,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,156.68,4920.29,0.0 -256,32,64,7168,ck,8,0,13.8569,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.12,49.96,0.0 -256,32,128,7168,ck,8,0,14.288,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.11,80.84,0.0 -256,32,512,7168,ck,8,0,20.8284,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.28,188.79,0.0 -256,32,576,7168,ck,8,0,20.9741,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.6,209.54,0.0 -256,32,1024,7168,ck,8,0,21.3011,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.05,358.43,0.0 -256,32,1536,7168,ck,8,0,21.328,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.04,531.59,0.0 -256,32,2112,7168,ck,8,0,20.8386,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.49,743.97,0.0 -256,32,2240,7168,ck,8,0,20.8898,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.19,786.46,0.0 -256,32,3072,1536,ck,8,0,6.5986,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.77,752.33,0.0 -256,32,4096,512,ck,13,0,3.5718,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,37.58,665.12,0.0 -256,32,4096,7168,ck,8,0,21.3382,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,88.06,1398.98,0.0 -256,32,4608,7168,ck,8,0,21.3536,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.0,1571.37,0.0 -256,32,7168,256,ck,13,0,3.239,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,36.26,710.7,0.0 -256,32,7168,512,ck,13,0,4.2078,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,55.82,985.11,0.0 -256,32,7168,2048,ck,8,0,8.0391,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,116.87,1891.3,0.0 -256,32,7168,2304,ck,8,0,8.7013,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,121.47,1959.2,0.0 -256,32,7168,4096,ck,8,0,14.1521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,132.78,2116.29,0.0 -256,32,7168,4608,ck,8,0,14.9953,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.97,2243.13,0.0 -256,32,7168,16384,ck,8,0,48.4226,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,155.22,2445.63,0.0 -256,32,7168,18432,ck,8,0,54.0238,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,156.52,2465.01,0.0 -256,32,8192,512,ck,13,0,4.2239,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,63.55,1121.0,0.0 -256,32,8192,1536,ck,8,0,6.8782,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,117.08,1912.76,0.0 -256,32,9216,7168,ck,7,0,25.0083,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,169.06,2674.29,0.0 -256,32,11264,1536,ck,18,0,8.6616,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,127.84,2086.4,0.0 -256,32,12288,1536,ck,18,0,8.3515,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,144.64,2360.05,0.0 -256,32,14336,1536,ck,7,0,8.7538,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,160.99,2625.92,0.0 -256,32,16384,512,ck,13,0,4.8406,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,110.91,1952.97,0.0 -256,32,20480,1536,ck,12,0,10.5564,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,190.72,3108.74,0.0 -256,32,24576,1536,ck,12,0,11.298,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,213.84,3484.75,0.0 -256,32,32768,512,ck,13,0,5.5756,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,192.58,3388.11,0.0 -256,32,32768,1536,ck,12,0,12.8162,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,251.34,4094.66,0.0 -256,32,36864,7168,ck,9,0,63.4973,a8w8_blockscale_1x128x128_256x32x256x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,266.33,4202.22,0.0 -256,64,64,7168,ck,8,0,14.0983,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.17,65.66,0.0 -256,64,128,7168,ck,8,0,14.5236,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.09,95.89,0.0 -256,64,512,7168,ck,8,0,20.5969,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.81,203.64,0.0 -256,64,576,7168,ck,8,0,20.6636,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.58,225.58,0.0 -256,64,1024,7168,ck,8,0,20.9843,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,44.77,377.89,0.0 -256,64,1536,7168,ck,8,0,21.2119,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,66.44,549.95,0.0 -256,64,2112,7168,ck,8,0,20.6114,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.01,769.86,0.0 -256,64,2240,7168,ck,8,0,20.6316,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.61,814.37,0.0 -256,64,3072,1536,ck,8,0,6.5488,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.23,795.58,0.0 -256,64,4096,512,ck,8,0,3.9182,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.51,677.4,0.0 -256,64,4096,7168,ck,8,0,21.2311,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,177.01,1429.18,0.0 -256,64,4608,7168,ck,7,0,23.1112,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,182.94,1474.55,0.0 -256,64,7168,256,cktile,0,0,3.3434,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,70.25,828.17,0.0 -256,64,7168,512,ck,13,0,4.4706,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,105.08,1033.48,0.0 -256,64,7168,2048,ck,7,0,8.8766,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,211.69,1771.92,0.0 -256,64,7168,2304,ck,7,0,9.7303,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,217.25,1806.73,0.0 -256,64,7168,4096,ck,7,0,15.6595,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,239.99,1950.24,0.0 -256,64,7168,4608,ck,7,0,16.6612,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,253.75,2055.23,0.0 -256,64,7168,16384,ck,18,0,53.066,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,283.28,2250.15,0.0 -256,64,7168,18432,ck,18,0,58.5515,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,288.83,2292.3,0.0 -256,64,8192,512,ck,13,0,4.4992,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,119.33,1172.57,0.0 -256,64,8192,1536,ck,18,0,8.3243,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,193.48,1649.36,0.0 -256,64,9216,7168,ck,18,0,25.8746,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,326.8,2616.41,0.0 -256,64,11264,1536,ck,18,0,8.4093,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,263.35,2240.57,0.0 -256,64,12288,1536,ck,18,0,8.4019,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,287.54,2445.34,0.0 -256,64,14336,1536,ck,18,0,8.8597,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,318.13,2703.64,0.0 -256,64,16384,512,ck,18,0,5.5061,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,195.01,1910.34,0.0 -256,64,20480,1536,ck,18,0,13.129,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,306.69,2603.17,0.0 -256,64,24576,1536,ck,18,0,13.7416,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,351.62,2983.11,0.0 -256,64,32768,512,ck,2,0,7.9558,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,269.93,2640.12,0.0 -256,64,32768,1536,ck,18,0,14.825,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,434.57,3684.6,0.0 -256,64,36864,7168,ck,14,0,75.9499,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,445.33,3547.32,0.0 -256,96,128,7168,ck,8,0,14.4064,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.23,113.16,0.0 -256,96,512,7168,ck,8,0,20.2501,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,34.8,220.07,0.0 -256,96,1024,7168,ck,8,0,20.6839,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.13,397.64,0.0 -256,96,2112,7168,ck,8,0,20.5046,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,141.76,791.65,0.0 -256,96,2240,7168,ck,8,0,20.4717,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,150.59,838.94,0.0 -256,96,3072,1536,ck,8,0,7.8324,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.67,696.58,0.0 -256,96,4096,512,ck,13,0,4.2826,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,94.02,684.8,0.0 -256,96,4096,7168,ck,7,0,23.3977,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,240.93,1317.85,0.0 -256,96,4608,7168,ck,7,0,23.8202,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,266.24,1452.67,0.0 -256,96,7168,256,ck,8,0,4.3182,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,81.59,749.35,0.0 -256,96,7168,512,ck,13,0,5.319,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,132.48,957.97,0.0 -256,96,7168,2048,ck,18,0,9.6386,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,292.43,1686.23,0.0 -256,96,7168,2304,ck,18,0,10.3846,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,305.35,1744.17,0.0 -256,96,7168,4096,ck,18,0,16.1889,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,348.21,1922.9,0.0 -256,96,7168,4608,ck,18,0,17.2691,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,367.23,2017.98,0.0 -256,96,7168,16384,ck,18,0,55.0061,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,409.93,2188.66,0.0 -256,96,7168,18432,ck,18,0,61.6242,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,411.64,2195.02,0.0 -256,96,8192,512,ck,13,0,5.3545,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,150.4,1086.25,0.0 -256,96,8192,1536,ck,18,0,8.203,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,294.52,1743.66,0.0 -256,96,9216,7168,ck,12,0,28.2134,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,449.56,2428.56,0.0 -256,96,11264,1536,ck,18,0,11.0633,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,300.26,1772.68,0.0 -256,96,12288,1536,ck,18,0,11.286,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,321.1,1894.48,0.0 -256,96,14336,1536,ck,18,0,11.6314,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,363.49,2142.48,0.0 -256,96,16384,512,ck,13,0,6.1392,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,262.35,1886.81,0.0 -256,96,20480,1536,ck,12,0,14.4047,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,419.29,2467.03,0.0 -256,96,24576,1536,ck,18,0,16.4815,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,439.75,2585.61,0.0 -256,96,32768,512,ck,13,0,9.4586,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,340.56,2444.11,0.0 -256,96,32768,1536,ck,0,0,19.4557,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,496.7,2917.94,0.0 -256,96,36864,7168,ck,12,0,95.4929,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,531.29,2848.45,0.0 -256,128,64,7168,ck,8,0,14.4734,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.11,96.22,0.0 -256,128,128,7168,ck,8,0,14.3264,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.39,130.37,0.0 -256,128,512,7168,ck,8,0,19.8912,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.23,237.22,0.0 -256,128,576,7168,ck,8,0,20.1014,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,52.58,258.38,0.0 -256,128,1024,7168,ck,8,0,20.4235,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.0,417.15,0.0 -256,128,1536,7168,ck,8,0,19.9837,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,141.04,616.54,0.0 -256,128,2112,7168,ck,8,0,21.1709,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,183.06,783.95,0.0 -256,128,2240,7168,ck,18,0,23.1376,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,177.65,758.39,0.0 -256,128,3072,1536,ck,7,0,7.2074,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,167.6,791.08,0.0 -256,128,4096,512,ck,13,0,4.3212,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,124.24,743.14,0.0 -256,128,4096,7168,ck,7,0,23.6181,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,318.24,1326.36,0.0 -256,128,4608,7168,ck,18,0,23.9089,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,353.66,1469.21,0.0 -256,128,7168,256,ck,18,0,4.5079,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,104.21,821.4,0.0 -256,128,7168,512,ck,13,0,5.3366,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,176.05,1043.84,0.0 -256,128,7168,2048,ck,18,0,9.4978,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,395.68,1766.43,0.0 -256,128,7168,2304,ck,18,0,10.3456,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,408.66,1802.21,0.0 -256,128,7168,4096,ck,18,0,16.0184,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,469.22,1980.19,0.0 -256,128,7168,4608,ck,18,0,17.0629,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,495.56,2077.9,0.0 -256,128,7168,16384,ck,18,0,56.7734,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,529.56,2137.84,0.0 -256,128,7168,18432,ck,18,0,62.4818,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,541.32,2181.67,0.0 -256,128,8192,512,ck,13,0,5.3945,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,199.04,1178.42,0.0 -256,128,8192,1536,ck,18,0,8.2154,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,392.1,1810.83,0.0 -256,128,9216,7168,ck,18,0,40.6919,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,415.6,1703.95,0.0 -256,128,11264,1536,ck,18,0,11.7531,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,376.85,1734.15,0.0 -256,128,12288,1536,ck,18,0,12.0014,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,402.61,1851.18,0.0 -256,128,14336,1536,ck,18,0,12.3486,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,456.5,2096.33,0.0 -256,128,16384,512,ck,9,0,8.0448,a8w8_blockscale_1x128x128_256x32x256x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,266.94,1572.25,0.0 -256,128,20480,1536,ck,18,0,16.796,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,479.46,2196.76,0.0 -256,128,24576,1536,ck,18,0,17.3757,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,556.16,2545.9,0.0 -256,128,32768,512,ck,2,0,10.8423,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,396.13,2327.12,0.0 -256,128,32768,1536,ck,0,0,19.2926,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,667.87,3053.86,0.0 -256,128,36864,7168,ck,18,0,105.9408,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,638.52,2591.97,0.0 -256,160,128,7168,ck,8,0,17.1227,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.15,122.96,0.0 -256,160,512,7168,ck,8,0,20.0008,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,58.72,249.03,0.0 -256,160,1024,7168,ck,8,0,20.4675,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.76,430.66,0.0 -256,160,2112,7168,ck,8,0,23.2184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,208.65,730.52,0.0 -256,160,2240,7168,ck,18,0,23.4811,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,218.82,763.17,0.0 -256,160,3072,1536,ck,7,0,7.4447,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,202.82,798.88,0.0 -256,160,4096,512,ck,13,0,5.1861,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,129.4,672.91,0.0 -256,160,4096,7168,ck,18,0,24.2819,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,386.92,1310.35,0.0 -256,160,4608,7168,ck,18,0,25.144,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,420.36,1417.9,0.0 -256,160,7168,256,ck,13,0,5.0474,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,116.34,826.11,0.0 -256,160,7168,512,ck,13,0,5.7215,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,205.26,1056.66,0.0 -256,160,7168,2048,ck,18,0,12.9511,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,362.72,1335.91,0.0 -256,160,7168,2304,ck,18,0,14.0589,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,375.91,1364.08,0.0 -256,160,7168,4096,ck,18,0,22.8022,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,412.03,1416.94,0.0 -256,160,7168,4608,ck,18,0,25.9107,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,407.93,1391.75,0.0 -256,160,7168,16384,ck,18,0,80.0493,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,469.47,1528.5,0.0 -256,160,7168,18432,ck,18,0,89.0583,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,474.73,1542.4,0.0 -256,160,8192,512,ck,13,0,6.3784,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,210.43,1081.41,0.0 -256,160,8192,1536,ck,18,0,11.0356,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,364.87,1400.02,0.0 -256,160,9216,7168,ck,18,0,42.7828,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,494.11,1639.82,0.0 -256,160,11264,1536,ck,12,0,14.0267,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,394.71,1507.96,0.0 -256,160,12288,1536,ck,12,0,14.4605,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,417.68,1594.16,0.0 -256,160,14336,1536,ck,18,0,15.5241,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,453.9,1729.79,0.0 -256,160,16384,512,ck,13,0,8.5655,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,313.39,1601.0,0.0 -256,160,20480,1536,ck,2,0,18.3532,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,548.48,2084.47,0.0 -256,160,24576,1536,ck,18,0,23.7575,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,508.45,1930.29,0.0 -256,160,32768,512,ck,18,0,13.356,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,401.97,2047.39,0.0 -256,160,32768,1536,ck,2,0,27.1819,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,592.53,2246.46,0.0 -256,160,36864,7168,cktile,11,0,93.7724,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,901.73,2955.93,0.0 -256,192,128,7168,ck,8,0,16.9935,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.73,137.87,0.0 -256,192,512,7168,ck,8,0,19.9037,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,70.81,263.41,0.0 -256,192,1024,7168,ck,8,0,20.4827,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.61,444.74,0.0 -256,192,2112,7168,ck,18,0,22.9699,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,253.08,754.29,0.0 -256,192,2240,7168,ck,18,0,23.148,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,266.36,790.25,0.0 -256,192,3072,1536,ck,18,0,8.4458,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,214.54,733.28,0.0 -256,192,4096,512,ck,13,0,5.1841,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,155.34,726.9,0.0 -256,192,4096,7168,ck,18,0,23.8819,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.09,1352.88,0.0 -256,192,4608,7168,ck,18,0,24.2883,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,522.21,1489.44,0.0 -256,192,7168,256,ck,13,0,5.1944,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,135.65,892.63,0.0 -256,192,7168,512,ck,13,0,6.009,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.53,1085.18,0.0 -256,192,7168,2048,ck,18,0,13.2851,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,424.32,1341.79,0.0 -256,192,7168,2304,ck,18,0,14.6467,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,432.98,1345.69,0.0 -256,192,7168,4096,ck,18,0,24.2362,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,465.18,1357.44,0.0 -256,192,7168,4608,ck,18,0,25.7255,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,493.04,1425.33,0.0 -256,192,7168,16384,ck,18,0,84.2651,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,535.18,1463.7,0.0 -256,192,7168,18432,ck,18,0,93.2472,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,544.08,1484.36,0.0 -256,192,8192,512,ck,13,0,6.479,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.59,1148.07,0.0 -256,192,8192,1536,ck,18,0,11.4405,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,422.35,1400.6,0.0 -256,192,9216,7168,ck,18,0,43.0984,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,588.59,1646.82,0.0 -256,192,11264,1536,ck,18,0,14.8667,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,446.89,1474.56,0.0 -256,192,12288,1536,ck,18,0,15.56,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,465.79,1535.21,0.0 -256,192,14336,1536,ck,18,0,16.0769,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,525.95,1730.44,0.0 -256,192,16384,512,ck,18,0,8.9495,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,359.93,1651.31,0.0 -256,192,20480,1536,ck,2,0,19.8064,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,609.88,2000.19,0.0 -256,192,24576,1536,ck,18,0,25.1336,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,576.74,1889.14,0.0 -256,192,32768,512,ck,17,0,14.141,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,455.59,2083.19,0.0 -256,192,32768,1536,ck,2,0,28.3939,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,680.69,2226.16,0.0 -256,192,36864,7168,ck,14,0,126.8721,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,799.77,2205.16,0.0 -256,224,128,7168,ck,8,0,17.8184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.07,144.82,0.0 -256,224,512,7168,ck,8,0,20.3114,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,80.95,271.03,0.0 -256,224,1024,7168,ck,8,0,20.7811,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,158.24,452.55,0.0 -256,224,2112,7168,ck,18,0,23.6685,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,286.55,747.43,0.0 -256,224,2240,7168,ck,18,0,23.7107,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,303.37,787.22,0.0 -256,224,3072,1536,ck,18,0,8.2059,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,257.61,784.67,0.0 -256,224,4096,512,ck,13,0,5.3431,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,175.84,757.4,0.0 -256,224,4096,7168,ck,18,0,25.5505,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,514.8,1283.76,0.0 -256,224,4608,7168,ck,12,0,26.9231,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,549.62,1363.15,0.0 -256,224,7168,256,ck,18,0,5.6461,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,145.6,903.92,0.0 -256,224,7168,512,ck,18,0,6.4236,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,255.96,1089.1,0.0 -256,224,7168,2048,ck,18,0,13.3235,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,493.61,1377.27,0.0 -256,224,7168,2304,ck,18,0,14.4191,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,513.12,1403.86,0.0 -256,224,7168,4096,ck,18,0,23.8325,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,551.91,1405.18,0.0 -256,224,7168,4608,ck,18,0,27.3172,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,541.69,1364.47,0.0 -256,224,7168,16384,ck,18,0,87.6811,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,600.05,1417.89,0.0 -256,224,7168,18432,ck,18,0,99.6528,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,593.96,1399.47,0.0 -256,224,8192,512,ck,17,0,6.4757,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,290.17,1232.15,0.0 -256,224,8192,1536,ck,18,0,11.52,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,489.34,1440.71,0.0 -256,224,9216,7168,ck,12,0,52.8597,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,559.88,1358.21,0.0 -256,224,11264,1536,ck,18,0,15.2931,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,506.83,1483.8,0.0 -256,224,12288,1536,ck,18,0,16.4055,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,515.42,1507.02,0.0 -256,224,14336,1536,ck,0,0,17.5524,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,562.03,1640.04,0.0 -256,224,16384,512,ck,18,0,10.2555,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,366.45,1544.86,0.0 -256,224,20480,1536,ck,18,0,24.9844,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,564.07,1640.08,0.0 -256,224,24576,1536,ck,2,0,26.2172,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,645.05,1872.92,0.0 -256,224,32768,512,ck,0,0,16.1208,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,466.24,1958.46,0.0 -256,224,32768,1536,ck,0,0,29.6624,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,760.17,2203.32,0.0 -256,224,36864,7168,ck,0,0,153.8167,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,769.62,1835.7,0.0 -256,256,64,7168,ck,8,0,16.0919,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.6,144.58,0.0 -256,256,128,7168,ck,8,0,18.917,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.83,148.97,0.0 -256,256,512,7168,ck,8,0,20.1783,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,93.12,285.81,0.0 -256,256,576,7168,ck,8,0,20.4521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.36,306.02,0.0 -256,256,1024,7168,ck,8,0,20.8683,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,180.09,464.79,0.0 -256,256,1536,7168,ck,7,0,22.5366,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,250.13,604.86,0.0 -256,256,2112,7168,ck,18,0,23.0603,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,336.12,782.95,0.0 -256,256,2240,7168,ck,18,0,23.1962,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,354.4,820.75,0.0 -256,256,3072,1536,ck,18,0,8.6547,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,279.15,772.37,0.0 -256,256,4096,512,ck,13,0,5.143,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,208.78,841.02,0.0 -256,256,4096,7168,ck,18,0,24.9567,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,602.34,1334.0,0.0 -256,256,4608,7168,ck,18,0,36.0444,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,469.18,1032.74,0.0 -256,256,7168,256,cktile,6,0,5.3481,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,175.67,1041.6,0.0 -256,256,7168,512,ck,18,0,6.913,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,271.81,1080.73,0.0 -256,256,7168,2048,ck,18,0,13.6951,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,548.82,1378.18,0.0 -256,256,7168,2304,ck,18,0,15.2977,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,552.74,1358.04,0.0 -256,256,7168,4096,ck,18,0,25.2675,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,594.93,1348.72,0.0 -256,256,7168,4608,ck,18,0,27.5468,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,613.92,1375.11,0.0 -256,256,7168,16384,ck,18,0,89.7873,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,669.69,1395.57,0.0 -256,256,7168,18432,ck,18,0,99.4348,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,680.3,1413.08,0.0 -256,256,8192,512,ck,18,0,7.299,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,294.22,1167.24,0.0 -256,256,8192,1536,ck,18,0,11.8143,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,545.31,1453.36,0.0 -256,256,9216,7168,ck,0,0,56.4314,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,599.36,1286.76,0.0 -256,256,11264,1536,ck,18,0,15.893,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.38,1476.24,0.0 -256,256,12288,1536,ck,0,0,16.6451,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,580.57,1535.53,0.0 -256,256,14336,1536,ck,0,0,17.292,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,651.99,1720.64,0.0 -256,256,16384,512,ck,0,0,10.4597,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,410.62,1616.52,0.0 -256,256,20480,1536,ck,18,0,23.7982,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,676.78,1778.97,0.0 -256,256,24576,1536,ck,2,0,27.2881,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,708.27,1858.86,0.0 -256,256,32768,512,ck,0,0,16.3211,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,526.31,2063.92,0.0 -256,256,32768,1536,ck,0,0,31.1808,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,826.46,2164.86,0.0 -256,256,36864,7168,ck,0,0,160.9084,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,840.8,1770.89,0.0 -256,288,128,7168,ck,8,0,19.0991,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.67,159.99,0.0 -256,288,512,7168,ck,8,0,20.3923,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.66,295.67,0.0 -256,288,1024,7168,ck,8,0,21.7244,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,194.61,460.05,0.0 -256,288,2112,7168,ck,18,0,24.2926,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,358.96,758.24,0.0 -256,288,2240,7168,ck,18,0,24.2274,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,381.73,801.2,0.0 -256,288,3072,1536,ck,12,0,8.9072,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,305.14,778.07,0.0 -256,288,4096,512,ck,13,0,5.4656,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,221.01,842.34,0.0 -256,288,4096,7168,ck,18,0,36.2776,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,466.17,931.26,0.0 -256,288,4608,7168,ck,18,0,34.9489,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,544.38,1080.11,0.0 -256,288,7168,256,ck,13,0,5.8969,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,179.24,1023.84,0.0 -256,288,7168,512,ck,12,0,7.8664,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,268.73,1010.15,0.0 -256,288,7168,2048,ck,12,0,16.1075,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,524.96,1204.32,0.0 -256,288,7168,2304,ck,12,0,17.6223,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,539.81,1209.12,0.0 -256,288,7168,4096,ck,12,0,29.9438,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,564.77,1157.79,0.0 -256,288,7168,4608,ck,12,0,34.1004,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.92,1128.61,0.0 -256,288,7168,16384,ck,12,0,111.0457,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,609.17,1137.26,0.0 -256,288,7168,18432,ck,12,0,123.9056,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,614.19,1142.46,0.0 -256,288,8192,512,ck,13,0,8.3615,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,288.93,1083.58,0.0 -256,288,8192,1536,ck,18,0,15.326,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.91,1157.76,0.0 -256,288,9216,7168,ck,0,0,60.4578,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.38,1214.62,0.0 -256,288,11264,1536,ck,2,0,18.8467,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,528.78,1285.74,0.0 -256,288,12288,1536,ck,2,0,19.4542,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,558.83,1356.76,0.0 -256,288,14336,1536,ck,18,0,23.6832,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,535.55,1297.12,0.0 -256,288,16384,512,ck,18,0,12.3165,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,392.31,1459.28,0.0 -256,288,20480,1536,ck,0,0,28.7904,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.36,1517.73,0.0 -256,288,24576,1536,ck,14,0,34.4764,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,630.67,1518.34,0.0 -256,288,32768,512,ck,17,0,20.5268,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,470.78,1744.01,0.0 -256,288,32768,1536,cktile,26,0,32.7131,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,886.22,2129.07,0.0 -256,288,36864,7168,ck,14,0,185.2145,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,821.77,1552.47,0.0 -256,320,128,7168,ck,8,0,19.3483,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,30.35,170.21,0.0 -256,320,512,7168,ck,8,0,20.4108,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.08,308.24,0.0 -256,320,1024,7168,ck,8,0,21.7255,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,216.23,473.6,0.0 -256,320,2112,7168,ck,18,0,23.4441,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,413.27,801.24,0.0 -256,320,2240,7168,ck,18,0,23.4986,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,437.3,841.91,0.0 -256,320,3072,1536,ck,12,0,8.7907,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,343.53,816.34,0.0 -256,320,4096,512,ck,13,0,6.1473,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,218.34,794.24,0.0 -256,320,4096,7168,ck,18,0,36.1058,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,520.43,949.3,0.0 -256,320,4608,7168,ck,18,0,37.9116,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.59,1009.53,0.0 -256,320,7168,256,ck,18,0,6.0807,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,193.14,1069.69,0.0 -256,320,7168,512,ck,13,0,8.1355,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,288.71,1035.14,0.0 -256,320,7168,2048,ck,18,0,17.897,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,524.96,1113.2,0.0 -256,320,7168,2304,ck,18,0,20.2519,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,521.91,1078.41,0.0 -256,320,7168,4096,ck,18,0,33.2548,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,565.05,1060.25,0.0 -256,320,7168,4608,ck,18,0,34.9015,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,605.68,1120.07,0.0 -256,320,7168,16384,ck,2,0,114.1333,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,658.55,1115.11,0.0 -256,320,7168,18432,ck,2,0,129.9128,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,650.88,1097.71,0.0 -256,320,8192,512,ck,13,0,8.9484,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,299.98,1072.93,0.0 -256,320,8192,1536,ck,18,0,15.3877,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,523.34,1190.39,0.0 -256,320,9216,7168,ck,0,0,60.0342,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,704.24,1236.83,0.0 -256,320,11264,1536,ck,2,0,18.2441,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,606.93,1370.41,0.0 -256,320,12288,1536,ck,14,0,19.4937,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,619.67,1396.87,0.0 -256,320,14336,1536,ck,18,0,24.0771,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,585.32,1316.05,0.0 -256,320,16384,512,ck,18,0,12.5931,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,426.32,1511.8,0.0 -256,320,20480,1536,ck,0,0,29.2128,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,689.17,1542.34,0.0 -256,320,24576,1536,ck,14,0,34.3884,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,702.54,1569.39,0.0 -256,320,32768,512,ck,2,0,21.6599,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,495.73,1750.36,0.0 -256,320,32768,1536,cktile,28,0,33.3128,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,966.96,2155.17,0.0 -256,320,36864,7168,ck,0,0,196.9898,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,858.49,1472.81,0.0 -256,352,128,7168,ck,8,0,19.7316,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.74,178.94,0.0 -256,352,512,7168,ck,8,0,20.4097,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,126.59,321.1,0.0 -256,352,1024,7168,ck,8,0,22.0666,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,234.17,479.64,0.0 -256,352,2112,7168,ck,18,0,24.9136,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,427.79,768.61,0.0 -256,352,2240,7168,ck,18,0,24.7563,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,456.6,814.19,0.0 -256,352,3072,1536,ck,18,0,10.6109,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,313.06,699.46,0.0 -256,352,4096,512,ck,17,0,6.2324,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,236.89,828.09,0.0 -256,352,4096,7168,ck,18,0,39.6236,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,521.65,877.43,0.0 -256,352,4608,7168,ck,18,0,35.2899,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,658.92,1099.39,0.0 -256,352,7168,256,ck,13,0,6.2709,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,206.01,1111.71,0.0 -256,352,7168,512,ck,13,0,8.2786,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,312.09,1074.64,0.0 -256,352,7168,2048,ck,18,0,18.526,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.85,1103.7,0.0 -256,352,7168,2304,ck,18,0,20.7383,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,560.63,1078.79,0.0 -256,352,7168,4096,ck,0,0,34.4901,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,599.29,1039.38,0.0 -256,352,7168,4608,ck,18,0,37.1048,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,626.69,1069.9,0.0 -256,352,7168,16384,ck,0,0,120.4757,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,686.26,1064.56,0.0 -256,352,7168,18432,ck,0,0,135.6072,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,685.9,1059.35,0.0 -256,352,8192,512,ck,18,0,8.9245,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,330.86,1136.39,0.0 -256,352,8192,1536,ck,18,0,15.8912,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.44,1188.76,0.0 -256,352,9216,7168,ck,0,0,62.6434,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,742.4,1198.39,0.0 -256,352,11264,1536,ck,18,0,23.2507,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,523.87,1108.44,0.0 -256,352,12288,1536,ck,18,0,23.3287,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,569.58,1203.06,0.0 -256,352,14336,1536,ck,2,0,25.9576,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,597.21,1257.95,0.0 -256,352,16384,512,ck,18,0,13.8423,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,426.63,1452.3,0.0 -256,352,20480,1536,ck,0,0,29.7336,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,744.81,1561.06,0.0 -256,352,24576,1536,cktile,26,0,31.6868,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,838.68,1754.39,0.0 -256,352,32768,512,ck,0,0,22.2946,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,529.78,1795.33,0.0 -256,352,32768,1536,cktile,26,0,33.7222,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1050.75,2192.65,0.0 -256,352,36864,7168,cktile,26,0,160.365,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1160.01,1825.31,0.0 -256,384,128,7168,ck,8,0,19.7154,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,35.74,191.14,0.0 -256,384,512,7168,ck,8,0,20.3983,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,138.18,334.13,0.0 -256,384,1024,7168,ck,8,0,22.3438,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,252.29,486.89,0.0 -256,384,2112,7168,ck,18,0,23.7978,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,488.56,819.96,0.0 -256,384,2240,7168,ck,18,0,23.7826,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,518.5,863.2,0.0 -256,384,3072,1536,ck,18,0,10.5698,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,342.85,725.44,0.0 -256,384,4096,512,ck,13,0,6.3161,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,255.0,861.21,0.0 -256,384,4096,7168,ck,18,0,34.9755,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,644.7,1008.09,0.0 -256,384,4608,7168,ck,18,0,42.9873,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,590.11,914.73,0.0 -256,384,7168,256,ck,18,0,6.5069,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,216.58,1143.15,0.0 -256,384,7168,512,ck,18,0,8.5934,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,327.99,1090.56,0.0 -256,384,7168,2048,ck,18,0,18.9583,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,594.69,1106.19,0.0 -256,384,7168,2304,ck,0,0,20.7129,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,612.35,1105.82,0.0 -256,384,7168,4096,ck,0,0,34.1125,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,661.01,1068.17,0.0 -256,384,7168,4608,ck,0,0,37.3568,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,679.05,1078.91,0.0 -256,384,7168,16384,ck,0,0,116.6161,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,773.43,1108.23,0.0 -256,384,7168,18432,ck,0,0,137.0626,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,740.31,1055.75,0.0 -256,384,8192,512,ck,18,0,8.908,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,361.61,1199.19,0.0 -256,384,8192,1536,ck,18,0,15.9312,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,606.59,1221.77,0.0 -256,384,9216,7168,ck,0,0,62.323,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,814.05,1217.7,0.0 -256,384,11264,1536,ck,18,0,22.6307,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,587.15,1172.84,0.0 -256,384,12288,1536,ck,18,0,23.4391,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,618.43,1233.04,0.0 -256,384,14336,1536,ck,2,0,25.7776,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,656.05,1304.23,0.0 -256,384,16384,512,ck,17,0,13.9619,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,461.43,1516.14,0.0 -256,384,20480,1536,ck,0,0,28.7288,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,840.94,1662.99,0.0 -256,384,24576,1536,cktile,11,0,32.299,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,897.58,1771.35,0.0 -256,384,32768,512,ck,0,0,23.0787,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,558.3,1825.91,0.0 -256,384,32768,1536,cktile,28,0,34.4376,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1122.46,2209.43,0.0 -256,384,36864,7168,ck,0,0,210.4362,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,964.36,1403.3,0.0 -256,416,128,7168,ck,8,0,20.0269,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.12,200.03,0.0 -256,416,512,7168,ck,8,0,20.6355,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,147.97,343.0,0.0 -256,416,1024,7168,ck,8,0,22.7135,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,268.87,491.95,0.0 -256,416,2112,7168,ck,18,0,25.1696,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,500.42,789.76,0.0 -256,416,2240,7168,ck,18,0,26.0516,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,512.78,802.33,0.0 -256,416,3072,1536,ck,18,0,10.8287,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,362.54,730.79,0.0 -256,416,4096,512,ck,17,0,6.4007,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,272.6,893.34,0.0 -256,416,4096,7168,ck,18,0,38.7328,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,630.67,922.99,0.0 -256,416,4608,7168,ck,18,0,40.166,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,684.19,992.03,0.0 -256,416,7168,256,ck,13,0,6.5711,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,232.34,1203.04,0.0 -256,416,7168,512,ck,13,0,9.2876,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,328.77,1060.21,0.0 -256,416,7168,2048,ck,0,0,21.113,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,578.5,1018.13,0.0 -256,416,7168,2304,ck,0,0,22.2607,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,617.26,1052.86,0.0 -256,416,7168,4096,ck,0,0,37.3648,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,653.76,990.98,0.0 -256,416,7168,4608,ck,0,0,37.6687,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,729.55,1086.07,0.0 -256,416,7168,16384,ck,0,0,127.2482,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,767.87,1023.35,0.0 -256,416,7168,18432,ck,0,0,140.696,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,781.29,1035.94,0.0 -256,416,8192,512,ck,0,0,10.1211,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,344.79,1108.88,0.0 -256,416,8192,1536,ck,0,0,17.5627,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,596.09,1140.92,0.0 -256,416,9216,7168,ck,14,0,69.5662,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,790.07,1102.69,0.0 -256,416,11264,1536,ck,18,0,25.0123,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,575.51,1091.95,0.0 -256,416,12288,1536,ck,2,0,25.9648,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,604.8,1145.28,0.0 -256,416,14336,1536,ck,0,0,27.6412,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,662.8,1251.27,0.0 -256,416,16384,512,ck,18,0,15.1957,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,459.3,1463.12,0.0 -256,416,20480,1536,cktile,28,0,31.9647,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,818.79,1537.18,0.0 -256,416,24576,1536,ck,0,0,43.5537,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,721.11,1350.86,0.0 -256,416,32768,512,ck,0,0,27.9556,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,499.31,1582.98,0.0 -256,416,32768,1536,ck,0,0,57.2349,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,731.65,1366.89,0.0 -256,416,36864,7168,cktile,28,0,174.5087,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1259.81,1707.04,0.0 -256,448,128,7168,ck,8,0,19.9415,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.22,212.8,0.0 -256,448,512,7168,ck,8,0,20.6355,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,159.35,355.7,0.0 -256,448,1024,7168,ck,18,0,23.1171,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,284.49,496.12,0.0 -256,448,2112,7168,ck,18,0,23.7136,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,572.01,853.62,0.0 -256,448,2240,7168,ck,18,0,25.0899,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,573.4,847.94,0.0 -256,448,3072,1536,ck,18,0,10.8713,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,388.9,750.53,0.0 -256,448,4096,512,ck,13,0,6.8369,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,274.84,877.09,0.0 -256,448,4096,7168,ck,18,0,42.0773,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,625.2,861.31,0.0 -256,448,4608,7168,ck,18,0,44.0377,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,672.04,916.72,0.0 -256,448,7168,256,ck,18,0,7.088,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,231.96,1181.18,0.0 -256,448,7168,512,ck,2,0,9.9265,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,331.27,1039.83,0.0 -256,448,7168,2048,ck,0,0,21.0124,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,625.98,1047.96,0.0 -256,448,7168,2304,ck,0,0,21.9519,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,674.09,1091.92,0.0 -256,448,7168,4096,ck,0,0,37.988,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,692.5,990.25,0.0 -256,448,7168,4608,ck,0,0,39.782,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,743.93,1043.61,0.0 -256,448,7168,16384,ck,0,0,127.2333,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,827.04,1031.2,0.0 -256,448,7168,18432,ck,0,0,141.5706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,836.19,1036.94,0.0 -256,448,8192,512,ck,0,0,10.2386,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,367.05,1148.96,0.0 -256,448,8192,1536,ck,0,0,17.232,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,654.26,1196.09,0.0 -256,448,9216,7168,ck,14,0,69.6246,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,850.13,1113.53,0.0 -256,448,11264,1536,ck,18,0,24.1979,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,640.64,1160.52,0.0 -256,448,12288,1536,ck,2,0,25.5359,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,662.26,1197.24,0.0 -256,448,14336,1536,ck,0,0,27.4252,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,719.41,1296.37,0.0 -256,448,16384,512,ck,18,0,15.8837,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,473.2,1466.79,0.0 -256,448,20480,1536,ck,2,0,40.9444,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,688.39,1233.27,0.0 -256,448,24576,1536,ck,0,0,45.0677,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,750.49,1341.47,0.0 -256,448,32768,512,ck,0,0,27.7693,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,541.33,1669.71,0.0 -256,448,32768,1536,ck,0,0,58.9473,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,765.04,1363.59,0.0 -256,448,36864,7168,ck,0,0,244.4989,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,968.35,1228.97,0.0 -256,480,128,7168,ck,8,0,20.0486,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.93,223.51,0.0 -256,480,512,7168,ck,8,0,20.5967,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,171.06,369.1,0.0 -256,480,1024,7168,ck,7,0,23.5657,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,299.01,499.19,0.0 -256,480,2112,7168,ck,18,0,32.4638,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,447.68,634.77,0.0 -256,480,2240,7168,ck,18,0,34.7895,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,443.07,622.24,0.0 -256,480,3072,1536,ck,18,0,11.1376,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,406.72,754.65,0.0 -256,480,4096,512,ck,18,0,6.3449,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,317.3,988.99,0.0 -256,480,4096,7168,ck,2,0,44.0884,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,639.3,833.17,0.0 -256,480,4608,7168,ck,18,0,52.5105,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,603.86,778.79,0.0 -256,480,7168,256,ck,13,0,7.4843,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,235.37,1181.03,0.0 -256,480,7168,512,ck,18,0,10.0728,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,349.78,1071.9,0.0 -256,480,7168,2048,ck,0,0,21.2321,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,663.75,1061.81,0.0 -256,480,7168,2304,ck,0,0,22.08,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,718.05,1109.7,0.0 -256,480,7168,4096,ck,0,0,39.2544,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,718.03,973.33,0.0 -256,480,7168,4608,ck,0,0,40.9212,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.88,1029.38,0.0 -256,480,7168,16384,ck,0,0,129.3282,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,871.76,1022.1,0.0 -256,480,7168,18432,ck,0,0,144.4652,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,877.97,1023.42,0.0 -256,480,8192,512,ck,0,0,10.3243,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,390.01,1191.79,0.0 -256,480,8192,1536,ck,0,0,17.4656,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,691.62,1212.93,0.0 -256,480,9216,7168,ck,0,0,92.5317,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,685.36,846.72,0.0 -256,480,11264,1536,ck,2,0,26.6676,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,622.83,1081.92,0.0 -256,480,12288,1536,ck,0,0,26.8339,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,675.24,1170.46,0.0 -256,480,14336,1536,ck,0,0,27.936,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,756.7,1307.27,0.0 -256,480,16384,512,ck,0,0,16.0696,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,501.14,1516.09,0.0 -256,480,20480,1536,cktile,27,0,32.2625,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,936.04,1607.3,0.0 -256,480,24576,1536,ck,0,0,46.2753,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,783.11,1341.51,0.0 -256,480,32768,512,ck,0,0,29.29,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,549.88,1655.18,0.0 -256,480,32768,1536,ck,0,0,57.6721,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,837.81,1430.96,0.0 -256,480,36864,7168,cktile,12,0,181.862,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,1394.86,1666.49,0.0 -256,512,64,7168,ck,8,0,19.8437,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.67,211.37,0.0 -256,512,128,7168,ck,8,0,19.9742,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.04,236.23,0.0 -256,512,512,7168,ck,8,0,20.7315,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,181.27,379.34,0.0 -256,512,576,7168,ck,18,0,22.9789,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,183.99,365.06,0.0 -256,512,1024,7168,ck,18,0,23.1935,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,324.06,519.91,0.0 -256,512,1536,7168,ck,18,0,23.3788,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,482.24,695.2,0.0 -256,512,2112,7168,ck,18,0,32.0652,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,483.46,654.03,0.0 -256,512,2240,7168,ck,18,0,34.4425,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,477.37,639.33,0.0 -256,512,3072,1536,ck,18,0,11.2304,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,430.25,770.3,0.0 -256,512,4096,512,ck,18,0,6.7823,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.63,966.28,0.0 -256,512,4096,7168,ck,18,0,44.5905,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,674.24,834.81,0.0 -256,512,4608,7168,ck,18,0,53.8717,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,627.84,768.84,0.0 -256,512,7168,256,cktile,10,0,7.6955,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,244.17,1209.29,0.0 -256,512,7168,512,ck,0,0,9.9586,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,377.37,1131.91,0.0 -256,512,7168,2048,ck,0,0,20.7712,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,723.71,1110.61,0.0 -256,512,7168,2304,ck,0,0,22.3331,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,757.24,1120.97,0.0 -256,512,7168,4096,ck,0,0,37.9824,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,791.54,1021.45,0.0 -256,512,7168,4608,ck,0,0,38.6734,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,874.58,1104.88,0.0 -256,512,7168,16384,ck,0,0,126.7885,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,948.5,1050.33,0.0 -256,512,7168,18432,ck,0,0,140.2762,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,964.46,1061.46,0.0 -256,512,8192,512,ck,0,0,10.2043,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,420.9,1258.79,0.0 -256,512,8192,1536,ck,0,0,17.0218,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,756.96,1278.24,0.0 -256,512,9216,7168,ck,18,0,93.1119,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,726.5,850.24,0.0 -256,512,11264,1536,ck,0,0,26.045,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,680.24,1137.35,0.0 -256,512,12288,1536,ck,2,0,26.2594,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,736.02,1227.89,0.0 -256,512,14336,1536,ck,0,0,27.3124,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,825.58,1372.51,0.0 -256,512,16384,512,ck,0,0,16.3731,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,524.64,1553.03,0.0 -256,512,20480,1536,cktile,27,0,32.2441,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,999.01,1650.39,0.0 -256,512,24576,1536,ck,0,0,46.3688,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,833.64,1373.79,0.0 -256,512,32768,512,ck,0,0,29.3632,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,585.08,1723.03,0.0 -256,512,32768,1536,ck,0,0,58.6281,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,879.09,1444.23,0.0 -256,512,36864,7168,ck,0,0,256.8593,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1053.43,1189.99,0.0 -256,1024,64,7168,ck,8,0,20.1439,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.64,393.66,0.0 -256,1024,128,7168,ck,8,0,20.3747,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.22,418.15,0.0 -256,1024,512,7168,ck,7,0,22.8839,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,328.45,526.95,0.0 -256,1024,576,7168,ck,18,0,23.2238,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,364.1,544.63,0.0 -256,1024,1024,7168,ck,18,0,24.4927,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,613.75,684.99,0.0 -256,1024,1536,7168,ck,18,0,38.4168,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,586.95,559.54,0.0 -256,1024,2112,7168,ck,18,0,51.2969,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,604.41,522.53,0.0 -256,1024,2240,7168,ck,18,0,53.7053,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,612.29,521.06,0.0 -256,1024,3072,1536,ck,18,0,16.0053,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,603.78,786.17,0.0 -256,1024,4096,512,ck,0,0,10.0853,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,425.86,1091.69,0.0 -256,1024,4096,7168,ck,0,0,63.9434,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,940.36,705.14,0.0 -256,1024,4608,7168,cktile,11,0,85.0782,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,795.1,585.43,0.0 -256,1024,7168,256,cktile,10,0,11.2282,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,334.7,1494.2,0.0 -256,1024,7168,512,ck,0,0,15.4877,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,485.3,1218.67,0.0 -256,1024,7168,2048,ck,0,0,34.0461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,883.06,923.96,0.0 -256,1024,7168,2304,ck,0,0,35.8476,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,943.52,936.03,0.0 -256,1024,7168,4096,ck,0,0,63.147,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,952.22,763.84,0.0 -256,1024,7168,4608,ck,0,0,71.2174,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,949.85,736.18,0.0 -256,1024,7168,16384,ck,0,0,216.3222,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1111.85,688.31,0.0 -256,1024,7168,18432,ck,0,0,254.6017,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1062.77,650.72,0.0 -256,1024,8192,512,ck,0,0,15.952,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.49,1347.53,0.0 -256,1024,8192,1536,ck,0,0,29.156,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,883.86,1060.95,0.0 -256,1024,9216,7168,cktile,28,0,88.3744,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1530.89,1044.13,0.0 -256,1024,11264,1536,ck,0,0,42.788,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,828.12,980.25,0.0 -256,1024,12288,1536,ck,0,0,45.7325,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,845.23,997.39,0.0 -256,1024,14336,1536,ck,0,0,54.4605,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,828.07,972.32,0.0 -256,1024,16384,512,ck,0,0,29.472,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,582.92,1440.94,0.0 -256,1024,20480,1536,ck,0,0,74.004,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,870.55,1013.1,0.0 -256,1024,24576,1536,ck,0,0,84.1555,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,918.65,1065.33,0.0 -256,1024,32768,512,ck,0,0,54.7645,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,627.41,1541.33,0.0 -256,1024,32768,1536,ck,0,0,109.5044,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,941.32,1086.84,0.0 -256,1024,36864,7168,ck,0,0,454.1379,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1191.63,764.26,0.0 -256,1536,512,7168,ck,18,0,23.412,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,481.56,694.21,0.0 -256,1536,576,7168,ck,18,0,24.205,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,524.01,698.55,0.0 -256,1536,1536,7168,ck,18,0,52.7469,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,641.23,506.92,0.0 -256,1536,3072,1536,ck,18,0,23.0151,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,629.83,717.58,0.0 -256,1536,4096,512,ck,18,0,13.642,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.25,1133.74,0.0 -256,1536,4608,7168,cktile,11,0,90.2343,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1124.5,644.94,0.0 -256,1536,7168,256,cktile,10,0,13.3199,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,423.21,1820.46,0.0 -256,1536,7168,2048,cktile,11,0,37.6651,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1197.32,1057.9,0.0 -256,1536,7168,2304,cktile,28,0,40.5466,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1251.26,1037.67,0.0 -256,2048,64,7168,ck,8,0,19.7903,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.95,778.21,0.0 -256,2048,128,7168,ck,8,0,20.2497,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,185.59,796.15,0.0 -256,2048,512,7168,ck,18,0,24.9633,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,602.18,819.09,0.0 -256,2048,576,7168,ck,18,0,33.9564,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,498.03,623.39,0.0 -256,2048,1024,7168,ck,18,0,42.9028,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,700.76,611.02,0.0 -256,2048,1536,7168,ck,0,0,57.9353,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,778.41,552.02,0.0 -256,2048,2112,7168,ck,2,0,88.3043,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,702.21,435.65,0.0 -256,2048,2240,7168,ck,2,0,89.9975,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,730.76,443.47,0.0 -256,2048,3072,1536,ck,0,0,25.9919,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,743.59,786.68,0.0 -256,2048,4096,512,ck,0,0,15.9389,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.93,1249.96,0.0 -256,2048,4096,7168,ck,0,0,114.7033,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1048.44,530.21,0.0 -256,2048,4608,7168,cktile,27,0,88.3939,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1530.55,753.27,0.0 -256,2048,7168,256,cktile,10,0,18.4391,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,407.62,1720.23,0.0 -256,2048,7168,512,ck,0,0,27.0315,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,556.11,1260.7,0.0 -256,2048,7168,2048,ck,0,0,68.5722,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,876.88,703.41,0.0 -256,2048,7168,2304,ck,0,0,69.6842,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,970.75,726.04,0.0 -256,2048,7168,4096,ck,0,0,118.0138,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1019.03,568.65,0.0 -256,2048,7168,4608,ck,0,0,128.9241,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1049.39,557.13,0.0 -256,2048,7168,16384,ck,0,0,415.9935,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1156.36,433.55,0.0 -256,2048,7168,18432,ck,0,0,473.0973,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1143.88,421.12,0.0 -256,2048,8192,512,ck,0,0,29.4968,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,582.43,1315.31,0.0 -256,2048,8192,1536,ck,0,0,56.6629,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,909.58,869.76,0.0 -256,2048,9216,7168,cktile,26,0,172.1216,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1572.05,688.4,0.0 -256,2048,11264,1536,cktile,27,0,62.8212,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1128.07,1059.91,0.0 -256,2048,12288,1536,ck,0,0,83.8143,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,922.39,863.24,0.0 -256,2048,14336,1536,ck,0,0,96.0698,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,938.84,873.18,0.0 -256,2048,16384,512,ck,0,0,55.3461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,620.82,1383.04,0.0 -256,2048,20480,1536,ck,0,0,132.3682,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,973.41,895.15,0.0 -256,2048,24576,1536,ck,0,0,153.4331,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1007.73,922.6,0.0 -256,2048,32768,512,ck,0,0,103.5934,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,663.36,1467.7,0.0 -256,2048,32768,1536,ck,0,0,196.9804,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1046.59,952.86,0.0 -256,2048,36864,7168,cktile,27,0,554.6038,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1951.54,775.18,0.0 -256,4096,64,7168,ck,8,0,21.3467,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,176.05,1421.45,0.0 -256,4096,128,7168,ck,18,0,23.7564,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.39,1318.64,0.0 -256,4096,512,7168,ck,18,0,44.3253,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,678.28,839.8,0.0 -256,4096,576,7168,ck,18,0,55.4301,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,610.19,689.29,0.0 -256,4096,1024,7168,ck,0,0,66.6998,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,901.5,676.0,0.0 -256,4096,1536,7168,cktile,11,0,90.0488,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1001.62,588.05,0.0 -256,4096,2112,7168,cktile,28,0,91.0323,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1362.34,678.88,0.0 -256,4096,2240,7168,cktile,27,0,91.0389,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1444.8,700.43,0.0 -256,4096,3072,1536,ck,0,0,42.2397,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,915.13,856.44,0.0 -256,4096,4096,512,ck,0,0,28.9648,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,593.13,1303.26,0.0 -256,4096,4096,7168,ck,0,0,209.3355,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1148.96,440.8,0.0 -256,4096,4608,7168,cktile,28,0,173.1867,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1562.38,578.21,0.0 -256,4096,7168,256,cktile,10,0,29.5555,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,508.62,2084.34,0.0 -256,4096,7168,512,ck,0,0,46.6829,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,644.02,1381.39,0.0 -256,4096,7168,2048,ck,0,0,120.1982,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1000.51,680.45,0.0 -256,4096,7168,2304,cktile,11,0,117.9301,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1147.22,717.99,0.0 -256,4096,7168,4096,ck,0,0,214.4016,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.81,489.07,0.0 -256,4096,7168,4608,ck,0,0,228.59,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1183.7,483.94,0.0 -256,4096,7168,16384,cktile,11,0,521.4023,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1845.16,466.57,0.0 -256,4096,7168,18432,ck,0,0,881.3469,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1228.04,302.19,0.0 -256,4096,8192,512,ck,0,0,56.3409,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,609.85,1302.79,0.0 -256,4096,8192,1536,ck,0,0,105.2588,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,979.29,816.87,0.0 -256,4096,9216,7168,cktile,28,0,296.1434,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1827.38,577.15,0.0 -256,4096,11264,1536,ck,0,0,142.0615,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,997.69,815.62,0.0 -256,4096,12288,1536,ck,0,0,152.3699,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1014.76,825.81,0.0 -256,4096,14336,1536,ck,0,0,176.1737,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1023.92,827.32,0.0 -256,4096,16384,512,ck,0,0,103.9428,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,661.13,1392.15,0.0 -256,4096,20480,1536,ck,0,0,238.3449,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1081.2,862.28,0.0 -256,4096,24576,1536,ck,0,0,282.6274,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1094.15,868.16,0.0 -256,4096,32768,512,ck,14,0,207.1159,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,663.58,1387.19,0.0 -256,4096,32768,1536,ck,0,0,374.8153,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1100.05,867.25,0.0 -256,4096,36864,7168,cktile,28,0,1082.0215,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2000.57,550.44,0.0 -256,6144,128,7168,ck,18,0,25.0579,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,449.93,1856.92,0.0 -256,6144,512,7168,ck,0,0,59.4018,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,759.19,909.09,0.0 -256,6144,1024,7168,ck,0,0,101.4656,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,888.92,630.39,0.0 -256,6144,2112,7168,cktile,13,0,162.6521,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_4,1143.7,523.39,0.0 -256,6144,2240,7168,cktile,13,0,167.5962,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_4,1177.23,522.81,0.0 -256,6144,3072,1536,ck,0,0,67.303,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,861.51,771.21,0.0 -256,6144,4096,512,ck,0,0,37.6611,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,684.26,1475.65,0.0 -256,6144,4096,7168,cktile,11,0,179.7043,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2007.62,688.53,0.0 -256,6144,4608,7168,ck,0,0,343.9947,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1179.89,388.65,0.0 -256,6144,7168,256,ck,17,0,51.0329,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,441.84,1792.73,0.0 -256,6144,7168,512,ck,0,0,70.5242,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,639.46,1345.58,0.0 -256,6144,7168,2048,ck,0,0,173.8221,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1037.78,663.57,0.0 -256,6144,7168,2304,ck,0,0,188.2806,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1077.84,630.71,0.0 -256,6144,7168,4096,ck,0,0,308.4794,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1169.53,462.29,0.0 -256,6144,7168,4608,cktile,27,0,241.4545,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1680.96,618.84,0.0 -256,6144,7168,16384,cktile,28,0,704.2201,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2049.23,434.78,0.0 -256,6144,7168,18432,cktile,26,0,788.6719,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2058.52,422.8,0.0 -256,6144,8192,512,ck,0,0,76.3779,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,674.8,1414.07,0.0 -256,6144,8192,1536,ck,0,0,147.4517,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1048.61,832.02,0.0 -256,6144,9216,7168,cktile,26,0,417.6534,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1943.59,534.77,0.0 -256,6144,11264,1536,ck,0,0,206.4783,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1029.65,799.85,0.0 -256,6144,12288,1536,ck,0,0,216.1487,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1073.0,829.55,0.0 -256,6144,14336,1536,ck,0,0,253.0583,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1069.25,820.44,0.0 -256,6144,16384,512,ck,0,0,141.5297,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,728.32,1504.0,0.0 -256,6144,20480,1536,ck,0,0,353.1291,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1094.63,828.46,0.0 -256,6144,24576,1536,ck,0,0,417.451,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1111.16,836.45,0.0 -256,6144,32768,512,ck,0,0,270.6067,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,761.84,1561.59,0.0 -256,6144,32768,1536,ck,0,0,552.4767,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.46,837.0,0.0 -256,6144,36864,7168,cktile,26,0,1548.0338,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2097.5,491.76,0.0 -256,8192,64,7168,ck,18,0,25.7151,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,292.29,2342.11,0.0 -256,8192,128,7168,ck,18,0,26.8171,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,560.55,2302.07,0.0 -256,8192,576,7168,ck,0,0,89.2735,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,757.74,809.72,0.0 -256,8192,1024,7168,ck,0,0,119.4233,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1007.0,693.65,0.0 -256,8192,1536,7168,cktile,11,0,158.9815,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1134.65,596.9,0.0 -256,8192,2112,7168,cktile,26,0,177.1163,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1400.4,612.38,0.0 -256,8192,2240,7168,cktile,27,0,177.0258,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1486.04,629.72,0.0 -256,8192,3072,1536,ck,0,0,82.3211,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,939.12,821.58,0.0 -256,8192,4096,512,ck,0,0,52.9601,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,648.79,1385.96,0.0 -256,8192,4096,7168,cktile,28,0,261.1689,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1841.86,594.21,0.0 -256,8192,4608,7168,cktile,28,0,292.7567,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1848.52,571.29,0.0 -256,8192,7168,256,cktile,10,0,54.7017,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,549.61,2218.81,0.0 -256,8192,7168,512,ck,0,0,85.2883,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,705.02,1469.19,0.0 -256,8192,7168,2048,ck,0,0,221.4286,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1086.21,672.44,0.0 -256,8192,7168,2304,cktile,11,0,199.6737,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1355.13,765.4,0.0 -256,8192,7168,4096,ck,0,0,400.2668,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1201.79,450.59,0.0 -256,8192,7168,4608,cktile,28,0,306.6077,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1765.01,613.88,0.0 -256,8192,7168,16384,cktile,26,0,880.476,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2185.35,419.2,0.0 -256,8192,7168,18432,cktile,28,0,993.9915,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2177.75,402.98,0.0 -256,8192,8192,512,ck,0,0,102.7856,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,668.57,1387.42,0.0 -256,8192,8192,1536,ck,0,0,195.7465,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1053.19,814.23,0.0 -256,8192,9216,7168,cktile,27,0,525.3431,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2060.24,524.94,0.0 -256,8192,11264,1536,ck,0,0,258.0178,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1098.64,831.08,0.0 -256,8192,12288,1536,ck,0,0,285.766,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1082.14,814.6,0.0 -256,8192,14336,1536,ck,0,0,326.789,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1104.01,824.64,0.0 -256,8192,16384,512,ck,0,0,186.7906,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,735.79,1504.46,0.0 -256,8192,20480,1536,ck,0,0,462.2371,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.0,821.19,0.0 -256,8192,24576,1536,ck,0,0,554.2004,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.98,817.37,0.0 -256,8192,32768,512,ck,0,0,369.0041,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,744.92,1511.75,0.0 -256,8192,32768,1536,ck,0,0,733.9936,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.49,817.15,0.0 -256,8192,36864,7168,cktile,28,0,2053.303,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2108.47,451.44,0.0 -256,10240,128,7168,ck,18,0,42.6021,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,441.07,1806.0,0.0 -256,10240,512,7168,ck,0,0,96.4596,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,779.21,907.7,0.0 -256,10240,1024,7168,cktile,28,0,93.1432,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1613.9,1091.99,0.0 -256,10240,2112,7168,cktile,27,0,182.2038,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1701.63,723.33,0.0 -256,10240,2240,7168,cktile,11,0,184.1157,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1786.02,735.04,0.0 -256,10240,3072,1536,ck,0,0,102.0072,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,947.35,817.21,0.0 -256,10240,4096,512,ck,0,0,65.3698,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,657.03,1395.54,0.0 -256,10240,4096,7168,ck,0,0,489.7941,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1227.65,381.07,0.0 -256,10240,4608,7168,cktile,27,0,346.7326,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1950.95,579.13,0.0 -256,10240,7168,256,ck,18,0,81.5831,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,460.65,1854.02,0.0 -256,10240,7168,512,ck,0,0,103.3089,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,727.55,1507.26,0.0 -256,10240,7168,2048,ck,0,0,271.7203,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1106.46,671.47,0.0 -256,10240,7168,2304,ck,0,0,291.7057,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1159.49,640.74,0.0 -256,10240,7168,4096,cktile,11,0,335.9717,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1789.72,649.17,0.0 -256,10240,7168,4608,cktile,27,0,363.5745,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1860.57,624.4,0.0 -256,10240,7168,16384,cktile,28,0,1085.6779,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2215.37,397.92,0.0 -256,10240,7168,18432,cktile,26,0,1209.4499,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2237.24,386.68,0.0 -256,10240,8192,512,ck,0,0,118.4046,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,725.47,1496.64,0.0 -256,10240,8192,1536,ck,0,0,238.501,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1080.49,822.15,0.0 -256,10240,9216,7168,cktile,11,0,664.1018,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2037.21,494.21,0.0 -256,10240,11264,1536,ck,0,0,323.467,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1095.43,815.28,0.0 -256,10240,12288,1536,ck,0,0,353.3679,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1093.89,810.09,0.0 -256,10240,14336,1536,ck,0,0,409.667,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1100.82,808.83,0.0 -256,10240,16384,512,ck,0,0,225.2848,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,762.58,1549.93,0.0 -256,10240,20480,1536,ck,0,0,576.4526,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1117.6,809.46,0.0 -256,10240,24576,1536,ck,0,0,688.7802,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.41,808.38,0.0 -256,10240,32768,512,ck,0,0,442.473,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,776.54,1566.44,0.0 -256,10240,32768,1536,ck,0,0,917.7738,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.14,803.19,0.0 -256,10240,36864,7168,cktile,26,0,2574.9353,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2101.67,424.33,0.0 -256,12288,128,7168,ck,18,0,44.9391,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,501.76,2050.41,0.0 -256,12288,512,7168,ck,0,0,102.6049,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,879.04,1016.85,0.0 -256,12288,1024,7168,cktile,28,0,97.9679,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1841.3,1230.88,0.0 -256,12288,2112,7168,cktile,26,0,259.1603,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1435.6,598.56,0.0 -256,12288,2240,7168,cktile,26,0,261.1259,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1511.15,609.62,0.0 -256,12288,3072,1536,ck,0,0,119.017,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,974.35,832.57,0.0 -256,12288,4096,512,ck,0,0,77.2939,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,666.8,1410.87,0.0 -256,12288,4096,7168,cktile,11,0,352.7815,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2045.33,618.24,0.0 -256,12288,4608,7168,cktile,11,0,434.5259,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1868.13,539.34,0.0 -256,12288,7168,256,cktile,6,0,94.972,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x32_intrawave_0x0x0x0_1,474.85,1907.31,0.0 -256,12288,7168,512,ck,0,0,124.0238,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,727.23,1500.7,0.0 -256,12288,7168,2048,ck,0,0,316.8378,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1138.68,681.76,0.0 -256,12288,7168,2304,ck,0,0,339.0438,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1197.11,651.8,0.0 -256,12288,7168,4096,cktile,27,0,395.2387,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1825.62,647.34,0.0 -256,12288,7168,4608,cktile,11,0,425.8071,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1906.38,624.26,0.0 -256,12288,7168,16384,cktile,28,0,1279.8588,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2255.11,386.71,0.0 -256,12288,7168,18432,cktile,11,0,1389.9798,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2336.0,384.73,0.0 -256,12288,8192,512,ck,0,0,138.724,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,743.05,1526.86,0.0 -256,12288,8192,1536,ck,0,0,284.452,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1087.13,818.36,0.0 -256,12288,9216,7168,cktile,11,0,787.3284,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2062.03,483.45,0.0 -256,12288,11264,1536,ck,0,0,383.3207,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1109.26,816.55,0.0 -256,12288,12288,1536,ck,0,0,419.245,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1106.41,810.36,0.0 -256,12288,14336,1536,ck,0,0,485.2263,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.29,810.38,0.0 -256,12288,16384,512,ck,0,0,271.3271,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,759.82,1538.12,0.0 -256,12288,20480,1536,ck,0,0,688.6851,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.57,803.92,0.0 -256,12288,24576,1536,ck,0,0,822.4186,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.03,803.24,0.0 -256,12288,32768,512,ck,0,0,530.0213,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,777.93,1562.91,0.0 -256,12288,32768,1536,ck,0,0,1098.3661,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1126.17,796.19,0.0 -256,12288,36864,7168,cktile,26,0,3042.4958,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2134.43,413.57,0.0 -256,14336,128,7168,ck,18,0,45.8225,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,574.1,2342.69,0.0 -256,14336,512,7168,ck,0,0,117.1634,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,898.12,1033.69,0.0 -256,14336,1024,7168,ck,0,0,203.2423,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1035.48,686.18,0.0 -256,14336,2112,7168,cktile,28,0,264.6636,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1640.04,674.27,0.0 -256,14336,2240,7168,cktile,27,0,265.5412,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1733.69,689.32,0.0 -256,14336,3072,1536,ck,0,0,140.8499,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,960.54,815.19,0.0 -256,14336,4096,512,ck,0,0,89.2808,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,673.49,1421.11,0.0 -256,14336,4096,7168,cktile,28,0,431.6564,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1950.19,578.15,0.0 -256,14336,4608,7168,cktile,28,0,499.0381,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1897.73,536.86,0.0 -256,14336,7168,256,ck,17,0,110.8353,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,474.7,1903.96,0.0 -256,14336,7168,512,ck,0,0,143.1255,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,735.21,1512.87,0.0 -256,14336,7168,2048,ck,0,0,373.5706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1126.71,668.04,0.0 -256,14336,7168,2304,ck,0,0,401.7208,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1178.73,634.93,0.0 -256,14336,7168,4096,cktile,11,0,451.527,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1864.37,650.24,0.0 -256,14336,7168,4608,cktile,26,0,491.627,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1926.34,619.6,0.0 -256,14336,7168,16384,cktile,26,0,1445.403,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2329.63,385.94,0.0 -256,14336,7168,18432,cktile,26,0,1627.8742,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2327.06,369.74,0.0 -256,14336,8192,512,ck,0,0,160.1493,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,750.92,1538.66,0.0 -256,14336,8192,1536,ck,0,0,330.5132,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1091.57,815.35,0.0 -256,14336,9216,7168,cktile,11,0,886.8712,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2135.69,488.3,0.0 -256,14336,11264,1536,ck,0,0,446.7232,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.46,810.98,0.0 -256,14336,12288,1536,ck,0,0,485.1467,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.47,810.51,0.0 -256,14336,14336,1536,ck,0,0,568.5764,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.42,800.39,0.0 -256,14336,16384,512,ck,0,0,311.9278,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,771.07,1556.42,0.0 -256,14336,20480,1536,ck,0,0,805.6795,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.48,795.2,0.0 -256,14336,24576,1536,ck,0,0,961.2433,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1125.97,795.23,0.0 -256,14336,32768,512,ck,0,0,610.1813,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,788.35,1579.27,0.0 -256,14336,32768,1536,ck,0,0,1286.8515,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.43,786.32,0.0 -256,14336,36864,7168,cktile,11,0,3573.2083,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2120.31,398.51,0.0 -256,16384,64,7168,ck,18,0,32.1288,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,467.88,3734.86,0.0 -256,16384,128,7168,ck,18,0,49.2013,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,611.06,2490.83,0.0 -256,16384,576,7168,ck,0,0,153.6456,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,880.54,914.08,0.0 -256,16384,1024,7168,ck,0,0,216.1688,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1112.64,732.46,0.0 -256,16384,1536,7168,cktile,11,0,256.6513,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1405.71,696.6,0.0 -256,16384,2112,7168,cktile,11,0,306.4497,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1618.76,658.46,0.0 -256,16384,2240,7168,cktile,11,0,307.9861,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1708.3,671.77,0.0 -256,16384,3072,1536,ck,0,0,155.5478,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,994.03,839.28,0.0 -256,16384,4096,512,ck,0,0,101.0461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,680.08,1432.05,0.0 -256,16384,4096,7168,cktile,27,0,511.7685,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1879.9,549.11,0.0 -256,16384,4608,7168,cktile,26,0,520.5986,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2079.01,579.07,0.0 -256,16384,6144,1536,ck,0,0,292.1834,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1058.37,807.47,0.0 -256,16384,7168,256,cktile,10,0,101.2722,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,593.74,2378.84,0.0 -256,16384,7168,512,ck,0,0,164.8841,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,729.36,1497.66,0.0 -256,16384,7168,2048,ck,0,0,421.5322,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1141.16,671.63,0.0 -256,16384,7168,2304,cktile,11,0,371.4356,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1456.96,778.45,0.0 -256,16384,7168,4096,cktile,27,0,520.6164,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1847.95,636.46,0.0 -256,16384,7168,4608,cktile,26,0,560.4838,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1931.07,612.7,0.0 -256,16384,7168,16384,cktile,28,0,1682.9186,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2286.68,368.86,0.0 -256,16384,7168,18432,cktile,11,0,1877.5648,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2305.82,356.31,0.0 -256,16384,8192,512,ck,0,0,187.5611,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,732.77,1498.28,0.0 -256,16384,8192,1536,ck,0,0,379.4481,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1086.62,806.92,0.0 -256,16384,9216,7168,cktile,26,0,1028.9257,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2103.81,471.84,0.0 -256,16384,11264,1536,ck,0,0,507.1159,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1117.96,811.58,0.0 -256,16384,12288,1536,ck,14,0,631.8107,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,978.89,707.01,0.0 -256,16384,14336,1536,ck,0,0,643.287,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.67,803.6,0.0 -256,16384,16384,512,ck,0,0,353.2954,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,778.04,1567.1,0.0 -256,16384,20480,1536,ck,0,0,912.096,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1130.14,797.85,0.0 -256,16384,24576,1536,ck,0,0,1102.0997,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.36,787.79,0.0 -256,16384,32768,512,ck,0,0,709.5617,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.78,1548.71,0.0 -256,16384,32768,1536,ck,0,0,1460.4078,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1129.32,786.93,0.0 -256,16384,36864,7168,cktile,26,0,4061.1773,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2132.06,391.42,0.0 -256,20480,576,7168,cktile,13,0,165.4454,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_4,1022.18,1054.86,0.0 -256,20480,1536,7168,cktile,11,0,295.5655,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1525.79,746.79,0.0 -256,20480,3072,1536,ck,0,0,186.8642,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1034.3,866.97,0.0 -256,20480,4096,512,ck,0,0,119.3033,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,720.01,1511.74,0.0 -256,20480,4608,7168,cktile,27,0,699.6189,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1933.79,526.82,0.0 -256,20480,7168,256,cktile,10,0,125.313,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,599.79,2399.43,0.0 -256,20480,7168,2048,ck,0,0,525.1612,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1144.97,666.89,0.0 -256,20480,7168,2304,cktile,11,0,470.5953,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1437.45,759.26,0.0 -256,32768,64,7168,ck,18,0,58.767,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,511.59,4076.0,0.0 -256,32768,128,7168,ck,0,0,74.5115,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,806.98,3277.17,0.0 -256,32768,512,7168,ck,0,0,221.9247,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1083.78,1226.12,0.0 -256,32768,576,7168,cktile,11,0,223.8974,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1208.51,1236.1,0.0 -256,32768,1024,7168,cktile,26,0,271.8378,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1769.57,1137.92,0.0 -256,32768,1536,7168,cktile,11,0,384.6335,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1875.95,901.0,0.0 -256,32768,2112,7168,cktile,27,0,534.9146,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1854.76,726.16,0.0 -256,32768,2240,7168,cktile,26,0,541.7648,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1942.29,734.15,0.0 -256,32768,3072,1536,ck,0,0,287.9805,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1073.81,890.26,0.0 -256,32768,4096,512,ck,0,0,185.2578,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,741.88,1550.86,0.0 -256,32768,4096,7168,cktile,28,0,951.6322,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2021.94,559.75,0.0 -256,32768,4608,7168,cktile,28,0,1052.2794,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2057.12,541.59,0.0 -256,32768,6144,1536,ck,0,0,556.7669,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.83,830.55,0.0 -256,32768,7168,256,ck,17,0,239.5893,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,501.94,2003.37,0.0 -256,32768,7168,512,ck,0,0,312.6371,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,769.32,1567.98,0.0 -256,32768,7168,2048,ck,0,0,839.2502,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1146.35,657.19,0.0 -256,32768,7168,2304,ck,0,0,901.4207,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1200.7,623.21,0.0 -256,32768,7168,4096,cktile,28,0,1008.0074,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1908.86,628.31,0.0 -256,32768,7168,4608,cktile,26,0,1082.5675,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1999.56,603.92,0.0 -256,32768,7168,16384,cktile,26,0,3378.3991,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2278.17,332.72,0.0 -256,32768,7168,18432,cktile,26,0,3742.2158,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2313.78,322.23,0.0 -256,32768,8192,512,ck,0,0,360.9476,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,761.55,1545.49,0.0 -256,32768,8192,1536,ck,0,0,737.4624,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1118.2,813.31,0.0 -256,32768,9216,7168,cktile,27,0,1959.5649,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2209.33,461.8,0.0 -256,32768,11264,1536,ck,0,0,1005.4332,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1127.74,801.48,0.0 -256,32768,12288,1536,ck,0,0,1102.1362,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.32,793.47,0.0 -256,32768,14336,1536,ck,0,0,1285.2161,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.85,787.32,0.0 -256,32768,16384,512,ck,0,0,696.8117,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,788.96,1577.05,0.0 -256,32768,20480,1536,ck,0,0,1826.5515,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.68,779.59,0.0 -256,32768,24576,1536,ck,0,0,2191.9056,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.65,774.98,0.0 -256,32768,32768,512,ck,0,0,1385.222,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,793.74,1574.5,0.0 -256,32768,32768,1536,ck,0,0,2918.4955,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1130.22,770.31,0.0 -256,32768,36864,7168,cktile,28,0,8025.7332,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2157.72,363.21,0.0 -256,65536,64,7168,ck,18,0,116.7699,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,514.94,4098.74,0.0 -256,65536,128,7168,ck,0,0,150.0461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,801.48,3248.71,0.0 -256,65536,512,7168,cktile,28,0,277.0424,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1736.33,1951.11,0.0 -256,65536,576,7168,cktile,27,0,381.3612,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1419.04,1440.6,0.0 -256,65536,1024,7168,cktile,27,0,535.6356,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1796.13,1141.3,0.0 -256,65536,1536,7168,cktile,27,0,708.5099,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2036.82,962.72,0.0 -256,65536,2112,7168,cktile,11,0,1048.9318,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1891.71,726.19,0.0 -256,65536,2240,7168,cktile,11,0,1052.852,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1998.89,740.29,0.0 -256,65536,3072,1536,ck,0,0,566.2254,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1092.28,897.23,0.0 -256,65536,4096,512,ck,0,0,357.1801,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,769.58,1602.9,0.0 -256,65536,4096,7168,cktile,26,0,1875.2217,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2052.18,552.46,0.0 -256,65536,4608,7168,cktile,27,0,2093.6777,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2067.81,528.63,0.0 -256,65536,6144,1536,ck,0,0,1111.303,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1113.06,823.72,0.0 -256,65536,7168,256,ck,17,0,469.6057,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,512.17,2040.3,0.0 -256,65536,7168,512,ck,0,0,621.2778,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.27,1572.16,0.0 -256,65536,7168,2048,ck,0,0,1669.8174,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1152.31,651.82,0.0 -256,65536,7168,2304,ck,0,0,1801.7363,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1201.43,614.43,0.0 -256,65536,7168,4096,cktile,27,0,1951.4105,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1972.06,634.06,0.0 -256,65536,7168,4608,cktile,11,0,2159.6204,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2004.67,590.17,0.0 -256,65536,7168,9216,cktile,11,0,3868.2373,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2238.4,416.1,0.0 -256,65536,7168,16384,cktile,26,0,6897.8894,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2231.58,308.89,0.0 -256,65536,7168,18432,cktile,28,0,7476.3659,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2316.27,304.91,0.0 -256,65536,8192,512,ck,0,0,704.0693,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,780.83,1578.67,0.0 -256,65536,8192,1536,ck,0,0,1473.86,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.01,805.36,0.0 -256,65536,9216,7168,cktile,27,0,3989.8626,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2170.16,437.05,0.0 -256,65536,11264,1536,ck,0,0,2012.9804,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1126.56,792.04,0.0 -256,65536,12288,1536,ck,0,0,2193.6639,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1127.75,788.7,0.0 -256,65536,14336,1536,ck,0,0,2557.2449,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.64,782.77,0.0 -256,65536,16384,512,ck,0,0,1381.4369,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,795.92,1584.89,0.0 -256,65536,20480,1536,ck,0,0,3667.0687,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1124.38,768.05,0.0 -256,65536,24576,1536,ck,0,0,4395.2438,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1125.72,764.38,0.0 -256,98304,7168,4096,cktile,28,0,2962.6601,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1948.4,621.5,0.0 -256,98304,7168,4608,cktile,26,0,3189.2174,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2036.23,594.28,0.0 -256,98304,7168,9216,cktile,27,0,5713.6008,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2273.17,416.78,0.0 -256,98304,7168,16384,cktile,26,0,10375.8827,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2225.33,302.37,0.0 -256,98304,7168,18432,cktile,27,0,11252.7048,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2308.42,298.0,0.0 -256,98304,8192,512,ck,0,0,1045.7319,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,788.57,1592.32,0.0 -256,98304,8192,1536,ck,0,0,2210.3452,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.24,802.68,0.0 -256,98304,9216,7168,cktile,26,0,5845.0573,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2222.05,441.85,0.0 -256,98304,11264,1536,ck,0,0,3016.2181,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1127.77,790.03,0.0 -256,98304,12288,1536,ck,0,0,3304.3299,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.03,782.55,0.0 -256,98304,14336,1536,ck,0,0,3862.1384,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1120.97,774.59,0.0 -256,98304,16384,512,ck,0,0,2069.8943,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,796.79,1584.6,0.0 -256,98304,20480,1536,ck,0,0,5509.1484,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.63,764.0,0.0 -256,131072,128,7168,ck,0,0,293.0508,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,820.74,3323.64,0.0 -256,131072,512,7168,cktile,11,0,545.9348,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1762.25,1973.52,0.0 -256,131072,576,7168,cktile,27,0,712.3688,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1519.34,1536.63,0.0 -256,131072,1024,7168,cktile,26,0,982.8131,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1957.79,1236.55,0.0 -256,131072,1536,7168,cktile,27,0,1418.7228,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2034.38,953.81,0.0 -256,131072,2112,7168,cktile,28,0,2108.0457,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1882.57,715.5,0.0 -256,131072,2240,7168,cktile,27,0,2103.3064,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2001.17,733.5,0.0 -256,131072,3072,1536,ck,0,0,1129.3765,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1095.25,895.5,0.0 -256,131072,4096,512,ck,0,0,712.1968,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,771.92,1604.82,0.0 -256,131072,4096,7168,cktile,27,0,3725.4959,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2065.92,548.28,0.0 -256,131072,6144,1536,ck,0,0,2213.8353,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1117.47,822.72,0.0 -256,131072,7168,256,ck,17,0,935.1344,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,514.4,2047.23,0.0 -256,131072,7168,512,ck,0,0,1234.1151,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,779.56,1579.94,0.0 -256,131072,7168,2048,ck,0,0,3347.1523,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1149.72,645.97,0.0 -256,131072,7168,4096,cktile,12,0,3923.2429,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,1961.79,623.28,0.0 -256,131072,7168,4608,cktile,26,0,4209.0554,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2057.15,597.77,0.0 -256,131072,7168,9216,cktile,28,0,7568.5865,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2288.05,416.6,0.0 -256,131072,7168,16384,cktile,27,0,13692.8484,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2248.35,302.64,0.0 -256,131072,7168,18432,cktile,26,0,15254.4732,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2270.46,290.22,0.0 -256,131072,8192,512,ck,0,0,1408.0054,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,780.9,1575.84,0.0 -256,131072,8192,1536,ck,0,0,2939.9522,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.97,803.21,0.0 -256,131072,9216,7168,cktile,26,0,7924.4831,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2185.29,431.76,0.0 -256,131072,12288,1536,ck,0,0,4404.9134,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.25,781.27,0.0 -256,131072,14336,1536,ck,0,0,5136.8917,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.72,775.07,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,128,7168,ck,8,0,15.3942,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.12,60.08,0.0 +gfx950,256,1,512,7168,ck,8,0,22.2417,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.33,165.37,0.0 +gfx950,256,1,1024,7168,ck,8,0,22.7838,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.64,322.56,0.0 +gfx950,256,1,2112,7168,ck,8,0,22.4396,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.35,675.15,0.0 +gfx950,256,1,2240,7168,ck,8,0,22.4289,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.43,716.4,0.0 +gfx950,256,1,3072,1536,ck,8,0,6.5611,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.44,720.35,0.0 +gfx950,256,1,4096,512,cktile,1,0,3.3013,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_2,1.27,637.89,0.0 +gfx950,256,1,4096,7168,ck,8,0,22.7643,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.58,1290.42,0.0 +gfx950,256,1,4608,7168,ck,8,0,22.7223,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.91,1454.37,0.0 +gfx950,256,1,7168,256,ck,8,0,2.9182,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.26,633.82,0.0 +gfx950,256,1,7168,512,ck,13,0,4.049,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,1.81,910.07,0.0 +gfx950,256,1,7168,2048,ck,8,0,7.9966,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.67,1837.84,0.0 +gfx950,256,1,7168,2304,ck,8,0,9.5203,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.47,1736.47,0.0 +gfx950,256,1,7168,4096,ck,8,0,14.5736,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.03,2015.88,0.0 +gfx950,256,1,7168,4608,ck,8,0,15.7127,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.2,2103.34,0.0 +gfx950,256,1,7168,16384,ck,8,0,50.6751,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.64,2318.13,0.0 +gfx950,256,1,7168,18432,ck,8,0,57.4061,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.6,2302.08,0.0 +gfx950,256,1,8192,512,ck,13,0,4.0954,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,2.05,1028.28,0.0 +gfx950,256,1,8192,1536,ck,8,0,6.2908,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.0,2003.06,0.0 +gfx950,256,1,9216,7168,ck,8,0,24.1893,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.46,2732.03,0.0 +gfx950,256,1,11264,1536,ck,8,0,7.5534,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.58,2293.74,0.0 +gfx950,256,1,12288,1536,ck,8,0,7.7799,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.85,2429.4,0.0 +gfx950,256,1,14336,1536,ck,8,0,8.0858,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.45,2727.04,0.0 +gfx950,256,1,16384,512,ck,8,0,4.6738,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.59,1801.94,0.0 +gfx950,256,1,20480,1536,ck,7,0,8.6493,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,7.27,3641.89,0.0 +gfx950,256,1,24576,1536,ck,8,0,9.3079,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.11,4061.0,0.0 +gfx950,256,1,32768,512,ck,8,0,5.2866,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.35,3186.03,0.0 +gfx950,256,1,32768,1536,ck,8,0,11.0399,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.12,4565.14,0.0 +gfx950,256,1,36864,7168,ck,8,0,51.5672,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.25,5125.78,0.0 +gfx950,256,2,128,7168,ck,8,0,15.3762,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.24,60.64,0.0 +gfx950,256,2,512,7168,ck,8,0,22.2604,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.66,165.6,0.0 +gfx950,256,2,1024,7168,ck,8,0,22.8521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.28,322.0,0.0 +gfx950,256,2,2112,7168,ck,8,0,22.4493,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.7,675.37,0.0 +gfx950,256,2,2240,7168,ck,8,0,22.4022,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.87,717.77,0.0 +gfx950,256,2,3072,1536,ck,8,0,6.4642,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.92,732.33,0.0 +gfx950,256,2,4096,512,ck,8,0,3.401,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.47,621.75,0.0 +gfx950,256,2,4096,7168,ck,8,0,22.7895,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.15,1289.67,0.0 +gfx950,256,2,4608,7168,ck,8,0,22.751,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.81,1453.25,0.0 +gfx950,256,2,7168,256,ck,8,0,2.908,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.52,641.06,0.0 +gfx950,256,2,7168,512,ck,13,0,4.0902,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,3.59,904.53,0.0 +gfx950,256,2,7168,2048,ck,8,0,8.4117,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.98,1749.09,0.0 +gfx950,256,2,7168,2304,ck,8,0,8.8052,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.5,1879.38,0.0 +gfx950,256,2,7168,4096,ck,8,0,14.5727,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.06,2017.26,0.0 +gfx950,256,2,7168,4608,ck,8,0,15.7396,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.39,2100.94,0.0 +gfx950,256,2,7168,16384,ck,8,0,50.6878,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.27,2318.15,0.0 +gfx950,256,2,7168,18432,ck,8,0,57.2045,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.24,2310.76,0.0 +gfx950,256,2,8192,512,ck,13,0,4.117,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,4.08,1026.98,0.0 +gfx950,256,2,8192,1536,ck,8,0,7.3302,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.87,1721.47,0.0 +gfx950,256,2,9216,7168,ck,8,0,24.268,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.89,2724.22,0.0 +gfx950,256,2,11264,1536,ck,8,0,6.4805,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.68,2677.21,0.0 +gfx950,256,2,12288,1536,ck,8,0,6.5398,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.54,2894.06,0.0 +gfx950,256,2,14336,1536,ck,8,0,8.0887,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.89,2729.8,0.0 +gfx950,256,2,16384,512,ck,8,0,4.7294,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.09,1787.79,0.0 +gfx950,256,2,20480,1536,ck,7,0,8.6736,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,14.51,3636.58,0.0 +gfx950,256,2,24576,1536,ck,8,0,9.316,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.21,4062.91,0.0 +gfx950,256,2,32768,512,ck,8,0,5.3243,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.6,3175.88,0.0 +gfx950,256,2,32768,1536,ck,8,0,11.0733,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.18,4557.43,0.0 +gfx950,256,2,36864,7168,ck,8,0,51.4438,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.55,5139.65,0.0 +gfx950,256,4,128,7168,ck,8,0,15.4615,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.47,61.26,0.0 +gfx950,256,4,512,7168,ck,8,0,22.3068,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.32,165.99,0.0 +gfx950,256,4,1024,7168,ck,8,0,22.8322,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.57,323.09,0.0 +gfx950,256,4,2112,7168,ck,8,0,22.473,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.39,675.67,0.0 +gfx950,256,4,2240,7168,ck,8,0,22.434,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.73,717.79,0.0 +gfx950,256,4,3072,1536,ck,8,0,6.7718,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.57,701.34,0.0 +gfx950,256,4,4096,512,cktile,0,0,3.2399,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,5.18,658.04,0.0 +gfx950,256,4,4096,7168,ck,8,0,22.7401,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.33,1293.82,0.0 +gfx950,256,4,4608,7168,ck,8,0,22.8359,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.57,1449.28,0.0 +gfx950,256,4,7168,256,ck,8,0,2.9298,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.01,646.25,0.0 +gfx950,256,4,7168,512,ck,13,0,4.0665,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,7.22,917.11,0.0 +gfx950,256,4,7168,2048,ck,8,0,8.3038,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.14,1775.77,0.0 +gfx950,256,4,7168,2304,ck,8,0,8.8224,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.98,1879.49,0.0 +gfx950,256,4,7168,4096,ck,8,0,14.5954,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.09,2016.65,0.0 +gfx950,256,4,7168,4608,ck,8,0,15.7107,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.82,2107.22,0.0 +gfx950,256,4,7168,16384,ck,8,0,50.7328,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.52,2317.31,0.0 +gfx950,256,4,7168,18432,ck,8,0,57.5155,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.38,2299.41,0.0 +gfx950,256,4,8192,512,ck,13,0,4.1178,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,8.15,1034.99,0.0 +gfx950,256,4,8192,1536,ck,8,0,6.5931,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,15.27,1919.37,0.0 +gfx950,256,4,9216,7168,ck,8,0,24.2612,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.78,2727.1,0.0 +gfx950,256,4,11264,1536,ck,8,0,6.5359,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.18,2661.88,0.0 +gfx950,256,4,12288,1536,ck,8,0,7.7422,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.5,2451.35,0.0 +gfx950,256,4,14336,1536,ck,8,0,8.0317,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.93,2756.69,0.0 +gfx950,256,4,16384,512,ck,8,0,4.6944,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.3,1815.3,0.0 +gfx950,256,4,20480,1536,ck,7,0,8.7372,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,28.8,3619.84,0.0 +gfx950,256,4,24576,1536,ck,8,0,9.41,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.09,4033.1,0.0 +gfx950,256,4,32768,512,ck,8,0,5.3284,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.19,3198.22,0.0 +gfx950,256,4,32768,1536,ck,8,0,11.1225,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.2,4549.33,0.0 +gfx950,256,4,36864,7168,ck,8,0,51.9106,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.72,5096.55,0.0 +gfx950,256,8,128,7168,ck,8,0,15.5627,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.94,62.77,0.0 +gfx950,256,8,512,7168,ck,8,0,22.3452,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.63,167.17,0.0 +gfx950,256,8,1024,7168,ck,8,0,22.9843,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.11,322.56,0.0 +gfx950,256,8,2112,7168,ck,8,0,22.4589,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.79,678.13,0.0 +gfx950,256,8,2240,7168,ck,8,0,22.4802,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.43,718.39,0.0 +gfx950,256,8,3072,1536,ck,8,0,6.482,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.65,737.43,0.0 +gfx950,256,8,4096,512,ck,8,0,3.4272,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.79,632.23,0.0 +gfx950,256,8,4096,7168,ck,8,0,22.9113,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.5,1286.83,0.0 +gfx950,256,8,4608,7168,ck,8,0,22.808,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.17,1453.93,0.0 +gfx950,256,8,7168,256,ck,8,0,2.9146,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.07,669.64,0.0 +gfx950,256,8,7168,512,ck,13,0,4.1122,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,14.28,921.36,0.0 +gfx950,256,8,7168,2048,ck,8,0,8.1857,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.69,1809.39,0.0 +gfx950,256,8,7168,2304,ck,8,0,8.8267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.94,1886.12,0.0 +gfx950,256,8,7168,4096,ck,8,0,14.65,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.07,2014.17,0.0 +gfx950,256,8,7168,4608,ck,8,0,15.796,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.46,2100.64,0.0 +gfx950,256,8,7168,16384,ck,8,0,50.8429,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.96,2314.7,0.0 +gfx950,256,8,7168,18432,ck,8,0,57.4158,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.82,2305.68,0.0 +gfx950,256,8,8192,512,ck,13,0,4.1533,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,16.16,1042.42,0.0 +gfx950,256,8,8192,1536,ck,8,0,6.4583,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,31.17,1970.53,0.0 +gfx950,256,8,9216,7168,ck,8,0,24.3102,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.48,2725.81,0.0 +gfx950,256,8,11264,1536,ck,8,0,6.624,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.79,2641.0,0.0 +gfx950,256,8,12288,1536,ck,8,0,7.0261,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.98,2716.05,0.0 +gfx950,256,8,14336,1536,ck,8,0,8.0884,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.56,2752.31,0.0 +gfx950,256,8,16384,512,ck,8,0,4.6609,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.8,1856.9,0.0 +gfx950,256,8,20480,1536,ck,7,0,8.8663,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,56.77,3586.3,0.0 +gfx950,256,8,24576,1536,ck,8,0,9.4837,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,63.69,4023.14,0.0 +gfx950,256,8,32768,512,ck,8,0,5.3293,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.37,3247.26,0.0 +gfx950,256,8,32768,1536,ck,8,0,11.1791,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,72.04,4550.3,0.0 +gfx950,256,8,36864,7168,ck,8,0,52.7222,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,80.19,5024.23,0.0 +gfx950,256,16,64,7168,ck,8,0,13.753,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.07,41.84,0.0 +gfx950,256,16,128,7168,ck,8,0,13.8217,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.12,74.98,0.0 +gfx950,256,16,512,7168,ck,8,0,20.9785,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.6,181.19,0.0 +gfx950,256,16,576,7168,ck,8,0,21.1695,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.24,201.32,0.0 +gfx950,256,16,1024,7168,ck,8,0,21.4879,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.93,348.45,0.0 +gfx950,256,16,1536,7168,ck,8,0,21.2635,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.57,525.5,0.0 +gfx950,256,16,2112,7168,ck,8,0,21.0289,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.04,728.57,0.0 +gfx950,256,16,2240,7168,ck,8,0,21.0862,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.37,770.3,0.0 +gfx950,256,16,3072,1536,ck,8,0,6.5418,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.08,740.08,0.0 +gfx950,256,16,4096,512,cktile,3,0,3.3118,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x64_intrawave_0x1x0_2,20.26,675.29,0.0 +gfx950,256,16,4096,7168,ck,8,0,21.5063,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.69,1376.61,0.0 +gfx950,256,16,4608,7168,ck,8,0,21.4253,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.33,1553.88,0.0 +gfx950,256,16,7168,256,ck,8,0,2.9668,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.79,697.21,0.0 +gfx950,256,16,7168,512,ck,8,0,4.1538,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.27,940.73,0.0 +gfx950,256,16,7168,2048,ck,8,0,8.5525,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.93,1747.12,0.0 +gfx950,256,16,7168,2304,ck,8,0,8.5986,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,61.46,1951.63,0.0 +gfx950,256,16,7168,4096,ck,8,0,14.2602,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,65.88,2079.57,0.0 +gfx950,256,16,7168,4608,ck,8,0,14.92,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,70.84,2234.13,0.0 +gfx950,256,16,7168,16384,ck,8,0,49.2174,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,76.36,2396.15,0.0 +gfx950,256,16,7168,18432,ck,8,0,54.4636,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.63,2435.48,0.0 +gfx950,256,16,8192,512,ck,8,0,4.1546,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.31,1074.63,0.0 +gfx950,256,16,8192,1536,ck,8,0,6.991,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,57.6,1840.89,0.0 +gfx950,256,16,9216,7168,ck,8,0,23.3394,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,90.57,2847.97,0.0 +gfx950,256,16,11264,1536,ck,8,0,6.5361,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,84.71,2705.98,0.0 +gfx950,256,16,12288,1536,ck,8,0,7.8318,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.12,2463.31,0.0 +gfx950,256,16,14336,1536,ck,8,0,8.0559,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,87.47,2793.41,0.0 +gfx950,256,16,16384,512,ck,8,0,4.6334,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,57.93,1925.39,0.0 +gfx950,256,16,20480,1536,ck,7,0,9.1661,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,109.82,3506.09,0.0 +gfx950,256,16,24576,1536,ck,8,0,9.7004,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,124.53,3975.07,0.0 +gfx950,256,16,32768,512,ck,7,0,5.5119,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,97.4,3235.54,0.0 +gfx950,256,16,32768,1536,ck,8,0,11.4461,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.71,4491.03,0.0 +gfx950,256,16,36864,7168,ck,8,0,53.9674,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,156.68,4920.29,0.0 +gfx950,256,32,64,7168,ck,8,0,13.8569,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.12,49.96,0.0 +gfx950,256,32,128,7168,ck,8,0,14.288,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.11,80.84,0.0 +gfx950,256,32,512,7168,ck,8,0,20.8284,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.28,188.79,0.0 +gfx950,256,32,576,7168,ck,8,0,20.9741,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.6,209.54,0.0 +gfx950,256,32,1024,7168,ck,8,0,21.3011,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.05,358.43,0.0 +gfx950,256,32,1536,7168,ck,8,0,21.328,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.04,531.59,0.0 +gfx950,256,32,2112,7168,ck,8,0,20.8386,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.49,743.97,0.0 +gfx950,256,32,2240,7168,ck,8,0,20.8898,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.19,786.46,0.0 +gfx950,256,32,3072,1536,ck,8,0,6.5986,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.77,752.33,0.0 +gfx950,256,32,4096,512,ck,13,0,3.5718,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,37.58,665.12,0.0 +gfx950,256,32,4096,7168,ck,8,0,21.3382,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,88.06,1398.98,0.0 +gfx950,256,32,4608,7168,ck,8,0,21.3536,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.0,1571.37,0.0 +gfx950,256,32,7168,256,ck,13,0,3.239,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,36.26,710.7,0.0 +gfx950,256,32,7168,512,ck,13,0,4.2078,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,55.82,985.11,0.0 +gfx950,256,32,7168,2048,ck,8,0,8.0391,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,116.87,1891.3,0.0 +gfx950,256,32,7168,2304,ck,8,0,8.7013,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,121.47,1959.2,0.0 +gfx950,256,32,7168,4096,ck,8,0,14.1521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,132.78,2116.29,0.0 +gfx950,256,32,7168,4608,ck,8,0,14.9953,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.97,2243.13,0.0 +gfx950,256,32,7168,16384,ck,8,0,48.4226,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,155.22,2445.63,0.0 +gfx950,256,32,7168,18432,ck,8,0,54.0238,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,156.52,2465.01,0.0 +gfx950,256,32,8192,512,ck,13,0,4.2239,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,63.55,1121.0,0.0 +gfx950,256,32,8192,1536,ck,8,0,6.8782,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,117.08,1912.76,0.0 +gfx950,256,32,9216,7168,ck,7,0,25.0083,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,169.06,2674.29,0.0 +gfx950,256,32,11264,1536,ck,18,0,8.6616,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,127.84,2086.4,0.0 +gfx950,256,32,12288,1536,ck,18,0,8.3515,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,144.64,2360.05,0.0 +gfx950,256,32,14336,1536,ck,7,0,8.7538,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,160.99,2625.92,0.0 +gfx950,256,32,16384,512,ck,13,0,4.8406,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,110.91,1952.97,0.0 +gfx950,256,32,20480,1536,ck,12,0,10.5564,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,190.72,3108.74,0.0 +gfx950,256,32,24576,1536,ck,12,0,11.298,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,213.84,3484.75,0.0 +gfx950,256,32,32768,512,ck,13,0,5.5756,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,192.58,3388.11,0.0 +gfx950,256,32,32768,1536,ck,12,0,12.8162,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,251.34,4094.66,0.0 +gfx950,256,32,36864,7168,ck,9,0,63.4973,a8w8_blockscale_1x128x128_256x32x256x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,266.33,4202.22,0.0 +gfx950,256,64,64,7168,ck,8,0,14.0983,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.17,65.66,0.0 +gfx950,256,64,128,7168,ck,8,0,14.5236,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.09,95.89,0.0 +gfx950,256,64,512,7168,ck,8,0,20.5969,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.81,203.64,0.0 +gfx950,256,64,576,7168,ck,8,0,20.6636,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.58,225.58,0.0 +gfx950,256,64,1024,7168,ck,8,0,20.9843,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,44.77,377.89,0.0 +gfx950,256,64,1536,7168,ck,8,0,21.2119,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,66.44,549.95,0.0 +gfx950,256,64,2112,7168,ck,8,0,20.6114,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.01,769.86,0.0 +gfx950,256,64,2240,7168,ck,8,0,20.6316,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.61,814.37,0.0 +gfx950,256,64,3072,1536,ck,8,0,6.5488,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.23,795.58,0.0 +gfx950,256,64,4096,512,ck,8,0,3.9182,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.51,677.4,0.0 +gfx950,256,64,4096,7168,ck,8,0,21.2311,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,177.01,1429.18,0.0 +gfx950,256,64,4608,7168,ck,7,0,23.1112,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,182.94,1474.55,0.0 +gfx950,256,64,7168,256,cktile,0,0,3.3434,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_1,70.25,828.17,0.0 +gfx950,256,64,7168,512,ck,13,0,4.4706,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,105.08,1033.48,0.0 +gfx950,256,64,7168,2048,ck,7,0,8.8766,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,211.69,1771.92,0.0 +gfx950,256,64,7168,2304,ck,7,0,9.7303,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,217.25,1806.73,0.0 +gfx950,256,64,7168,4096,ck,7,0,15.6595,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,239.99,1950.24,0.0 +gfx950,256,64,7168,4608,ck,7,0,16.6612,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,253.75,2055.23,0.0 +gfx950,256,64,7168,16384,ck,18,0,53.066,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,283.28,2250.15,0.0 +gfx950,256,64,7168,18432,ck,18,0,58.5515,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,288.83,2292.3,0.0 +gfx950,256,64,8192,512,ck,13,0,4.4992,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,119.33,1172.57,0.0 +gfx950,256,64,8192,1536,ck,18,0,8.3243,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,193.48,1649.36,0.0 +gfx950,256,64,9216,7168,ck,18,0,25.8746,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,326.8,2616.41,0.0 +gfx950,256,64,11264,1536,ck,18,0,8.4093,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,263.35,2240.57,0.0 +gfx950,256,64,12288,1536,ck,18,0,8.4019,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,287.54,2445.34,0.0 +gfx950,256,64,14336,1536,ck,18,0,8.8597,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,318.13,2703.64,0.0 +gfx950,256,64,16384,512,ck,18,0,5.5061,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,195.01,1910.34,0.0 +gfx950,256,64,20480,1536,ck,18,0,13.129,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,306.69,2603.17,0.0 +gfx950,256,64,24576,1536,ck,18,0,13.7416,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,351.62,2983.11,0.0 +gfx950,256,64,32768,512,ck,2,0,7.9558,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,269.93,2640.12,0.0 +gfx950,256,64,32768,1536,ck,18,0,14.825,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,434.57,3684.6,0.0 +gfx950,256,64,36864,7168,ck,14,0,75.9499,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,445.33,3547.32,0.0 +gfx950,256,96,128,7168,ck,8,0,14.4064,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.23,113.16,0.0 +gfx950,256,96,512,7168,ck,8,0,20.2501,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,34.8,220.07,0.0 +gfx950,256,96,1024,7168,ck,8,0,20.6839,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.13,397.64,0.0 +gfx950,256,96,2112,7168,ck,8,0,20.5046,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,141.76,791.65,0.0 +gfx950,256,96,2240,7168,ck,8,0,20.4717,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,150.59,838.94,0.0 +gfx950,256,96,3072,1536,ck,8,0,7.8324,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.67,696.58,0.0 +gfx950,256,96,4096,512,ck,13,0,4.2826,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,94.02,684.8,0.0 +gfx950,256,96,4096,7168,ck,7,0,23.3977,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,240.93,1317.85,0.0 +gfx950,256,96,4608,7168,ck,7,0,23.8202,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,266.24,1452.67,0.0 +gfx950,256,96,7168,256,ck,8,0,4.3182,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,81.59,749.35,0.0 +gfx950,256,96,7168,512,ck,13,0,5.319,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,132.48,957.97,0.0 +gfx950,256,96,7168,2048,ck,18,0,9.6386,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,292.43,1686.23,0.0 +gfx950,256,96,7168,2304,ck,18,0,10.3846,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,305.35,1744.17,0.0 +gfx950,256,96,7168,4096,ck,18,0,16.1889,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,348.21,1922.9,0.0 +gfx950,256,96,7168,4608,ck,18,0,17.2691,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,367.23,2017.98,0.0 +gfx950,256,96,7168,16384,ck,18,0,55.0061,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,409.93,2188.66,0.0 +gfx950,256,96,7168,18432,ck,18,0,61.6242,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,411.64,2195.02,0.0 +gfx950,256,96,8192,512,ck,13,0,5.3545,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,150.4,1086.25,0.0 +gfx950,256,96,8192,1536,ck,18,0,8.203,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,294.52,1743.66,0.0 +gfx950,256,96,9216,7168,ck,12,0,28.2134,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,449.56,2428.56,0.0 +gfx950,256,96,11264,1536,ck,18,0,11.0633,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,300.26,1772.68,0.0 +gfx950,256,96,12288,1536,ck,18,0,11.286,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,321.1,1894.48,0.0 +gfx950,256,96,14336,1536,ck,18,0,11.6314,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,363.49,2142.48,0.0 +gfx950,256,96,16384,512,ck,13,0,6.1392,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,262.35,1886.81,0.0 +gfx950,256,96,20480,1536,ck,12,0,14.4047,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,419.29,2467.03,0.0 +gfx950,256,96,24576,1536,ck,18,0,16.4815,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,439.75,2585.61,0.0 +gfx950,256,96,32768,512,ck,13,0,9.4586,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,340.56,2444.11,0.0 +gfx950,256,96,32768,1536,ck,0,0,19.4557,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,496.7,2917.94,0.0 +gfx950,256,96,36864,7168,ck,12,0,95.4929,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,531.29,2848.45,0.0 +gfx950,256,128,64,7168,ck,8,0,14.4734,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.11,96.22,0.0 +gfx950,256,128,128,7168,ck,8,0,14.3264,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.39,130.37,0.0 +gfx950,256,128,512,7168,ck,8,0,19.8912,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.23,237.22,0.0 +gfx950,256,128,576,7168,ck,8,0,20.1014,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,52.58,258.38,0.0 +gfx950,256,128,1024,7168,ck,8,0,20.4235,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.0,417.15,0.0 +gfx950,256,128,1536,7168,ck,8,0,19.9837,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,141.04,616.54,0.0 +gfx950,256,128,2112,7168,ck,8,0,21.1709,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,183.06,783.95,0.0 +gfx950,256,128,2240,7168,ck,18,0,23.1376,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,177.65,758.39,0.0 +gfx950,256,128,3072,1536,ck,7,0,7.2074,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,167.6,791.08,0.0 +gfx950,256,128,4096,512,ck,13,0,4.3212,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,124.24,743.14,0.0 +gfx950,256,128,4096,7168,ck,7,0,23.6181,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,318.24,1326.36,0.0 +gfx950,256,128,4608,7168,ck,18,0,23.9089,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,353.66,1469.21,0.0 +gfx950,256,128,7168,256,ck,18,0,4.5079,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,104.21,821.4,0.0 +gfx950,256,128,7168,512,ck,13,0,5.3366,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,176.05,1043.84,0.0 +gfx950,256,128,7168,2048,ck,18,0,9.4978,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,395.68,1766.43,0.0 +gfx950,256,128,7168,2304,ck,18,0,10.3456,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,408.66,1802.21,0.0 +gfx950,256,128,7168,4096,ck,18,0,16.0184,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,469.22,1980.19,0.0 +gfx950,256,128,7168,4608,ck,18,0,17.0629,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,495.56,2077.9,0.0 +gfx950,256,128,7168,16384,ck,18,0,56.7734,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,529.56,2137.84,0.0 +gfx950,256,128,7168,18432,ck,18,0,62.4818,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,541.32,2181.67,0.0 +gfx950,256,128,8192,512,ck,13,0,5.3945,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,199.04,1178.42,0.0 +gfx950,256,128,8192,1536,ck,18,0,8.2154,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,392.1,1810.83,0.0 +gfx950,256,128,9216,7168,ck,18,0,40.6919,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,415.6,1703.95,0.0 +gfx950,256,128,11264,1536,ck,18,0,11.7531,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,376.85,1734.15,0.0 +gfx950,256,128,12288,1536,ck,18,0,12.0014,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,402.61,1851.18,0.0 +gfx950,256,128,14336,1536,ck,18,0,12.3486,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,456.5,2096.33,0.0 +gfx950,256,128,16384,512,ck,9,0,8.0448,a8w8_blockscale_1x128x128_256x32x256x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,266.94,1572.25,0.0 +gfx950,256,128,20480,1536,ck,18,0,16.796,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,479.46,2196.76,0.0 +gfx950,256,128,24576,1536,ck,18,0,17.3757,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,556.16,2545.9,0.0 +gfx950,256,128,32768,512,ck,2,0,10.8423,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,396.13,2327.12,0.0 +gfx950,256,128,32768,1536,ck,0,0,19.2926,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,667.87,3053.86,0.0 +gfx950,256,128,36864,7168,ck,18,0,105.9408,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,638.52,2591.97,0.0 +gfx950,256,160,128,7168,ck,8,0,17.1227,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.15,122.96,0.0 +gfx950,256,160,512,7168,ck,8,0,20.0008,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,58.72,249.03,0.0 +gfx950,256,160,1024,7168,ck,8,0,20.4675,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.76,430.66,0.0 +gfx950,256,160,2112,7168,ck,8,0,23.2184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,208.65,730.52,0.0 +gfx950,256,160,2240,7168,ck,18,0,23.4811,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,218.82,763.17,0.0 +gfx950,256,160,3072,1536,ck,7,0,7.4447,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,202.82,798.88,0.0 +gfx950,256,160,4096,512,ck,13,0,5.1861,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,129.4,672.91,0.0 +gfx950,256,160,4096,7168,ck,18,0,24.2819,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,386.92,1310.35,0.0 +gfx950,256,160,4608,7168,ck,18,0,25.144,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,420.36,1417.9,0.0 +gfx950,256,160,7168,256,ck,13,0,5.0474,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,116.34,826.11,0.0 +gfx950,256,160,7168,512,ck,13,0,5.7215,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,205.26,1056.66,0.0 +gfx950,256,160,7168,2048,ck,18,0,12.9511,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,362.72,1335.91,0.0 +gfx950,256,160,7168,2304,ck,18,0,14.0589,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,375.91,1364.08,0.0 +gfx950,256,160,7168,4096,ck,18,0,22.8022,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,412.03,1416.94,0.0 +gfx950,256,160,7168,4608,ck,18,0,25.9107,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,407.93,1391.75,0.0 +gfx950,256,160,7168,16384,ck,18,0,80.0493,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,469.47,1528.5,0.0 +gfx950,256,160,7168,18432,ck,18,0,89.0583,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,474.73,1542.4,0.0 +gfx950,256,160,8192,512,ck,13,0,6.3784,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,210.43,1081.41,0.0 +gfx950,256,160,8192,1536,ck,18,0,11.0356,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,364.87,1400.02,0.0 +gfx950,256,160,9216,7168,ck,18,0,42.7828,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,494.11,1639.82,0.0 +gfx950,256,160,11264,1536,ck,12,0,14.0267,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,394.71,1507.96,0.0 +gfx950,256,160,12288,1536,ck,12,0,14.4605,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,417.68,1594.16,0.0 +gfx950,256,160,14336,1536,ck,18,0,15.5241,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,453.9,1729.79,0.0 +gfx950,256,160,16384,512,ck,13,0,8.5655,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,313.39,1601.0,0.0 +gfx950,256,160,20480,1536,ck,2,0,18.3532,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,548.48,2084.47,0.0 +gfx950,256,160,24576,1536,ck,18,0,23.7575,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,508.45,1930.29,0.0 +gfx950,256,160,32768,512,ck,18,0,13.356,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,401.97,2047.39,0.0 +gfx950,256,160,32768,1536,ck,2,0,27.1819,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,592.53,2246.46,0.0 +gfx950,256,160,36864,7168,cktile,11,0,93.7724,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,901.73,2955.93,0.0 +gfx950,256,192,128,7168,ck,8,0,16.9935,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.73,137.87,0.0 +gfx950,256,192,512,7168,ck,8,0,19.9037,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,70.81,263.41,0.0 +gfx950,256,192,1024,7168,ck,8,0,20.4827,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.61,444.74,0.0 +gfx950,256,192,2112,7168,ck,18,0,22.9699,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,253.08,754.29,0.0 +gfx950,256,192,2240,7168,ck,18,0,23.148,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,266.36,790.25,0.0 +gfx950,256,192,3072,1536,ck,18,0,8.4458,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,214.54,733.28,0.0 +gfx950,256,192,4096,512,ck,13,0,5.1841,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,155.34,726.9,0.0 +gfx950,256,192,4096,7168,ck,18,0,23.8819,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.09,1352.88,0.0 +gfx950,256,192,4608,7168,ck,18,0,24.2883,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,522.21,1489.44,0.0 +gfx950,256,192,7168,256,ck,13,0,5.1944,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,135.65,892.63,0.0 +gfx950,256,192,7168,512,ck,13,0,6.009,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.53,1085.18,0.0 +gfx950,256,192,7168,2048,ck,18,0,13.2851,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,424.32,1341.79,0.0 +gfx950,256,192,7168,2304,ck,18,0,14.6467,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,432.98,1345.69,0.0 +gfx950,256,192,7168,4096,ck,18,0,24.2362,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,465.18,1357.44,0.0 +gfx950,256,192,7168,4608,ck,18,0,25.7255,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,493.04,1425.33,0.0 +gfx950,256,192,7168,16384,ck,18,0,84.2651,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,535.18,1463.7,0.0 +gfx950,256,192,7168,18432,ck,18,0,93.2472,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,544.08,1484.36,0.0 +gfx950,256,192,8192,512,ck,13,0,6.479,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,248.59,1148.07,0.0 +gfx950,256,192,8192,1536,ck,18,0,11.4405,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,422.35,1400.6,0.0 +gfx950,256,192,9216,7168,ck,18,0,43.0984,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,588.59,1646.82,0.0 +gfx950,256,192,11264,1536,ck,18,0,14.8667,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,446.89,1474.56,0.0 +gfx950,256,192,12288,1536,ck,18,0,15.56,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,465.79,1535.21,0.0 +gfx950,256,192,14336,1536,ck,18,0,16.0769,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,525.95,1730.44,0.0 +gfx950,256,192,16384,512,ck,18,0,8.9495,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,359.93,1651.31,0.0 +gfx950,256,192,20480,1536,ck,2,0,19.8064,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,609.88,2000.19,0.0 +gfx950,256,192,24576,1536,ck,18,0,25.1336,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,576.74,1889.14,0.0 +gfx950,256,192,32768,512,ck,17,0,14.141,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,455.59,2083.19,0.0 +gfx950,256,192,32768,1536,ck,2,0,28.3939,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,680.69,2226.16,0.0 +gfx950,256,192,36864,7168,ck,14,0,126.8721,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,799.77,2205.16,0.0 +gfx950,256,224,128,7168,ck,8,0,17.8184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.07,144.82,0.0 +gfx950,256,224,512,7168,ck,8,0,20.3114,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,80.95,271.03,0.0 +gfx950,256,224,1024,7168,ck,8,0,20.7811,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,158.24,452.55,0.0 +gfx950,256,224,2112,7168,ck,18,0,23.6685,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,286.55,747.43,0.0 +gfx950,256,224,2240,7168,ck,18,0,23.7107,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,303.37,787.22,0.0 +gfx950,256,224,3072,1536,ck,18,0,8.2059,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,257.61,784.67,0.0 +gfx950,256,224,4096,512,ck,13,0,5.3431,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,175.84,757.4,0.0 +gfx950,256,224,4096,7168,ck,18,0,25.5505,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,514.8,1283.76,0.0 +gfx950,256,224,4608,7168,ck,12,0,26.9231,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,549.62,1363.15,0.0 +gfx950,256,224,7168,256,ck,18,0,5.6461,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,145.6,903.92,0.0 +gfx950,256,224,7168,512,ck,18,0,6.4236,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,255.96,1089.1,0.0 +gfx950,256,224,7168,2048,ck,18,0,13.3235,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,493.61,1377.27,0.0 +gfx950,256,224,7168,2304,ck,18,0,14.4191,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,513.12,1403.86,0.0 +gfx950,256,224,7168,4096,ck,18,0,23.8325,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,551.91,1405.18,0.0 +gfx950,256,224,7168,4608,ck,18,0,27.3172,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,541.69,1364.47,0.0 +gfx950,256,224,7168,16384,ck,18,0,87.6811,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,600.05,1417.89,0.0 +gfx950,256,224,7168,18432,ck,18,0,99.6528,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,593.96,1399.47,0.0 +gfx950,256,224,8192,512,ck,17,0,6.4757,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,290.17,1232.15,0.0 +gfx950,256,224,8192,1536,ck,18,0,11.52,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,489.34,1440.71,0.0 +gfx950,256,224,9216,7168,ck,12,0,52.8597,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,559.88,1358.21,0.0 +gfx950,256,224,11264,1536,ck,18,0,15.2931,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,506.83,1483.8,0.0 +gfx950,256,224,12288,1536,ck,18,0,16.4055,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,515.42,1507.02,0.0 +gfx950,256,224,14336,1536,ck,0,0,17.5524,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,562.03,1640.04,0.0 +gfx950,256,224,16384,512,ck,18,0,10.2555,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,366.45,1544.86,0.0 +gfx950,256,224,20480,1536,ck,18,0,24.9844,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,564.07,1640.08,0.0 +gfx950,256,224,24576,1536,ck,2,0,26.2172,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,645.05,1872.92,0.0 +gfx950,256,224,32768,512,ck,0,0,16.1208,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,466.24,1958.46,0.0 +gfx950,256,224,32768,1536,ck,0,0,29.6624,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,760.17,2203.32,0.0 +gfx950,256,224,36864,7168,ck,0,0,153.8167,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,769.62,1835.7,0.0 +gfx950,256,256,64,7168,ck,8,0,16.0919,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.6,144.58,0.0 +gfx950,256,256,128,7168,ck,8,0,18.917,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.83,148.97,0.0 +gfx950,256,256,512,7168,ck,8,0,20.1783,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,93.12,285.81,0.0 +gfx950,256,256,576,7168,ck,8,0,20.4521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.36,306.02,0.0 +gfx950,256,256,1024,7168,ck,8,0,20.8683,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,180.09,464.79,0.0 +gfx950,256,256,1536,7168,ck,7,0,22.5366,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,250.13,604.86,0.0 +gfx950,256,256,2112,7168,ck,18,0,23.0603,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,336.12,782.95,0.0 +gfx950,256,256,2240,7168,ck,18,0,23.1962,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,354.4,820.75,0.0 +gfx950,256,256,3072,1536,ck,18,0,8.6547,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,279.15,772.37,0.0 +gfx950,256,256,4096,512,ck,13,0,5.143,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,208.78,841.02,0.0 +gfx950,256,256,4096,7168,ck,18,0,24.9567,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,602.34,1334.0,0.0 +gfx950,256,256,4608,7168,ck,18,0,36.0444,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,469.18,1032.74,0.0 +gfx950,256,256,7168,256,cktile,6,0,5.3481,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,175.67,1041.6,0.0 +gfx950,256,256,7168,512,ck,18,0,6.913,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,271.81,1080.73,0.0 +gfx950,256,256,7168,2048,ck,18,0,13.6951,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,548.82,1378.18,0.0 +gfx950,256,256,7168,2304,ck,18,0,15.2977,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,552.74,1358.04,0.0 +gfx950,256,256,7168,4096,ck,18,0,25.2675,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,594.93,1348.72,0.0 +gfx950,256,256,7168,4608,ck,18,0,27.5468,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,613.92,1375.11,0.0 +gfx950,256,256,7168,16384,ck,18,0,89.7873,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,669.69,1395.57,0.0 +gfx950,256,256,7168,18432,ck,18,0,99.4348,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,680.3,1413.08,0.0 +gfx950,256,256,8192,512,ck,18,0,7.299,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,294.22,1167.24,0.0 +gfx950,256,256,8192,1536,ck,18,0,11.8143,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,545.31,1453.36,0.0 +gfx950,256,256,9216,7168,ck,0,0,56.4314,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,599.36,1286.76,0.0 +gfx950,256,256,11264,1536,ck,18,0,15.893,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.38,1476.24,0.0 +gfx950,256,256,12288,1536,ck,0,0,16.6451,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,580.57,1535.53,0.0 +gfx950,256,256,14336,1536,ck,0,0,17.292,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,651.99,1720.64,0.0 +gfx950,256,256,16384,512,ck,0,0,10.4597,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,410.62,1616.52,0.0 +gfx950,256,256,20480,1536,ck,18,0,23.7982,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,676.78,1778.97,0.0 +gfx950,256,256,24576,1536,ck,2,0,27.2881,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,708.27,1858.86,0.0 +gfx950,256,256,32768,512,ck,0,0,16.3211,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,526.31,2063.92,0.0 +gfx950,256,256,32768,1536,ck,0,0,31.1808,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,826.46,2164.86,0.0 +gfx950,256,256,36864,7168,ck,0,0,160.9084,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,840.8,1770.89,0.0 +gfx950,256,288,128,7168,ck,8,0,19.0991,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,27.67,159.99,0.0 +gfx950,256,288,512,7168,ck,8,0,20.3923,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.66,295.67,0.0 +gfx950,256,288,1024,7168,ck,8,0,21.7244,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,194.61,460.05,0.0 +gfx950,256,288,2112,7168,ck,18,0,24.2926,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,358.96,758.24,0.0 +gfx950,256,288,2240,7168,ck,18,0,24.2274,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,381.73,801.2,0.0 +gfx950,256,288,3072,1536,ck,12,0,8.9072,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,305.14,778.07,0.0 +gfx950,256,288,4096,512,ck,13,0,5.4656,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,221.01,842.34,0.0 +gfx950,256,288,4096,7168,ck,18,0,36.2776,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,466.17,931.26,0.0 +gfx950,256,288,4608,7168,ck,18,0,34.9489,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,544.38,1080.11,0.0 +gfx950,256,288,7168,256,ck,13,0,5.8969,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,179.24,1023.84,0.0 +gfx950,256,288,7168,512,ck,12,0,7.8664,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,268.73,1010.15,0.0 +gfx950,256,288,7168,2048,ck,12,0,16.1075,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,524.96,1204.32,0.0 +gfx950,256,288,7168,2304,ck,12,0,17.6223,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,539.81,1209.12,0.0 +gfx950,256,288,7168,4096,ck,12,0,29.9438,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,564.77,1157.79,0.0 +gfx950,256,288,7168,4608,ck,12,0,34.1004,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.92,1128.61,0.0 +gfx950,256,288,7168,16384,ck,12,0,111.0457,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,609.17,1137.26,0.0 +gfx950,256,288,7168,18432,ck,12,0,123.9056,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,614.19,1142.46,0.0 +gfx950,256,288,8192,512,ck,13,0,8.3615,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,288.93,1083.58,0.0 +gfx950,256,288,8192,1536,ck,18,0,15.326,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.91,1157.76,0.0 +gfx950,256,288,9216,7168,ck,0,0,60.4578,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.38,1214.62,0.0 +gfx950,256,288,11264,1536,ck,2,0,18.8467,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,528.78,1285.74,0.0 +gfx950,256,288,12288,1536,ck,2,0,19.4542,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,558.83,1356.76,0.0 +gfx950,256,288,14336,1536,ck,18,0,23.6832,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,535.55,1297.12,0.0 +gfx950,256,288,16384,512,ck,18,0,12.3165,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,392.31,1459.28,0.0 +gfx950,256,288,20480,1536,ck,0,0,28.7904,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.36,1517.73,0.0 +gfx950,256,288,24576,1536,ck,14,0,34.4764,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,630.67,1518.34,0.0 +gfx950,256,288,32768,512,ck,17,0,20.5268,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,470.78,1744.01,0.0 +gfx950,256,288,32768,1536,cktile,26,0,32.7131,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,886.22,2129.07,0.0 +gfx950,256,288,36864,7168,ck,14,0,185.2145,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,821.77,1552.47,0.0 +gfx950,256,320,128,7168,ck,8,0,19.3483,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,30.35,170.21,0.0 +gfx950,256,320,512,7168,ck,8,0,20.4108,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.08,308.24,0.0 +gfx950,256,320,1024,7168,ck,8,0,21.7255,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,216.23,473.6,0.0 +gfx950,256,320,2112,7168,ck,18,0,23.4441,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,413.27,801.24,0.0 +gfx950,256,320,2240,7168,ck,18,0,23.4986,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,437.3,841.91,0.0 +gfx950,256,320,3072,1536,ck,12,0,8.7907,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,343.53,816.34,0.0 +gfx950,256,320,4096,512,ck,13,0,6.1473,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,218.34,794.24,0.0 +gfx950,256,320,4096,7168,ck,18,0,36.1058,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,520.43,949.3,0.0 +gfx950,256,320,4608,7168,ck,18,0,37.9116,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.59,1009.53,0.0 +gfx950,256,320,7168,256,ck,18,0,6.0807,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,193.14,1069.69,0.0 +gfx950,256,320,7168,512,ck,13,0,8.1355,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,288.71,1035.14,0.0 +gfx950,256,320,7168,2048,ck,18,0,17.897,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,524.96,1113.2,0.0 +gfx950,256,320,7168,2304,ck,18,0,20.2519,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,521.91,1078.41,0.0 +gfx950,256,320,7168,4096,ck,18,0,33.2548,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,565.05,1060.25,0.0 +gfx950,256,320,7168,4608,ck,18,0,34.9015,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,605.68,1120.07,0.0 +gfx950,256,320,7168,16384,ck,2,0,114.1333,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,658.55,1115.11,0.0 +gfx950,256,320,7168,18432,ck,2,0,129.9128,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,650.88,1097.71,0.0 +gfx950,256,320,8192,512,ck,13,0,8.9484,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,299.98,1072.93,0.0 +gfx950,256,320,8192,1536,ck,18,0,15.3877,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,523.34,1190.39,0.0 +gfx950,256,320,9216,7168,ck,0,0,60.0342,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,704.24,1236.83,0.0 +gfx950,256,320,11264,1536,ck,2,0,18.2441,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,606.93,1370.41,0.0 +gfx950,256,320,12288,1536,ck,14,0,19.4937,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,619.67,1396.87,0.0 +gfx950,256,320,14336,1536,ck,18,0,24.0771,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,585.32,1316.05,0.0 +gfx950,256,320,16384,512,ck,18,0,12.5931,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,426.32,1511.8,0.0 +gfx950,256,320,20480,1536,ck,0,0,29.2128,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,689.17,1542.34,0.0 +gfx950,256,320,24576,1536,ck,14,0,34.3884,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,702.54,1569.39,0.0 +gfx950,256,320,32768,512,ck,2,0,21.6599,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,495.73,1750.36,0.0 +gfx950,256,320,32768,1536,cktile,28,0,33.3128,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,966.96,2155.17,0.0 +gfx950,256,320,36864,7168,ck,0,0,196.9898,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,858.49,1472.81,0.0 +gfx950,256,352,128,7168,ck,8,0,19.7316,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.74,178.94,0.0 +gfx950,256,352,512,7168,ck,8,0,20.4097,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,126.59,321.1,0.0 +gfx950,256,352,1024,7168,ck,8,0,22.0666,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,234.17,479.64,0.0 +gfx950,256,352,2112,7168,ck,18,0,24.9136,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,427.79,768.61,0.0 +gfx950,256,352,2240,7168,ck,18,0,24.7563,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,456.6,814.19,0.0 +gfx950,256,352,3072,1536,ck,18,0,10.6109,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,313.06,699.46,0.0 +gfx950,256,352,4096,512,ck,17,0,6.2324,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,236.89,828.09,0.0 +gfx950,256,352,4096,7168,ck,18,0,39.6236,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,521.65,877.43,0.0 +gfx950,256,352,4608,7168,ck,18,0,35.2899,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,658.92,1099.39,0.0 +gfx950,256,352,7168,256,ck,13,0,6.2709,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,206.01,1111.71,0.0 +gfx950,256,352,7168,512,ck,13,0,8.2786,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,312.09,1074.64,0.0 +gfx950,256,352,7168,2048,ck,18,0,18.526,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.85,1103.7,0.0 +gfx950,256,352,7168,2304,ck,18,0,20.7383,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,560.63,1078.79,0.0 +gfx950,256,352,7168,4096,ck,0,0,34.4901,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,599.29,1039.38,0.0 +gfx950,256,352,7168,4608,ck,18,0,37.1048,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,626.69,1069.9,0.0 +gfx950,256,352,7168,16384,ck,0,0,120.4757,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,686.26,1064.56,0.0 +gfx950,256,352,7168,18432,ck,0,0,135.6072,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,685.9,1059.35,0.0 +gfx950,256,352,8192,512,ck,18,0,8.9245,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,330.86,1136.39,0.0 +gfx950,256,352,8192,1536,ck,18,0,15.8912,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.44,1188.76,0.0 +gfx950,256,352,9216,7168,ck,0,0,62.6434,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,742.4,1198.39,0.0 +gfx950,256,352,11264,1536,ck,18,0,23.2507,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,523.87,1108.44,0.0 +gfx950,256,352,12288,1536,ck,18,0,23.3287,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,569.58,1203.06,0.0 +gfx950,256,352,14336,1536,ck,2,0,25.9576,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,597.21,1257.95,0.0 +gfx950,256,352,16384,512,ck,18,0,13.8423,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,426.63,1452.3,0.0 +gfx950,256,352,20480,1536,ck,0,0,29.7336,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,744.81,1561.06,0.0 +gfx950,256,352,24576,1536,cktile,26,0,31.6868,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,838.68,1754.39,0.0 +gfx950,256,352,32768,512,ck,0,0,22.2946,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,529.78,1795.33,0.0 +gfx950,256,352,32768,1536,cktile,26,0,33.7222,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1050.75,2192.65,0.0 +gfx950,256,352,36864,7168,cktile,26,0,160.365,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1160.01,1825.31,0.0 +gfx950,256,384,128,7168,ck,8,0,19.7154,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,35.74,191.14,0.0 +gfx950,256,384,512,7168,ck,8,0,20.3983,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,138.18,334.13,0.0 +gfx950,256,384,1024,7168,ck,8,0,22.3438,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,252.29,486.89,0.0 +gfx950,256,384,2112,7168,ck,18,0,23.7978,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,488.56,819.96,0.0 +gfx950,256,384,2240,7168,ck,18,0,23.7826,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,518.5,863.2,0.0 +gfx950,256,384,3072,1536,ck,18,0,10.5698,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,342.85,725.44,0.0 +gfx950,256,384,4096,512,ck,13,0,6.3161,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,255.0,861.21,0.0 +gfx950,256,384,4096,7168,ck,18,0,34.9755,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,644.7,1008.09,0.0 +gfx950,256,384,4608,7168,ck,18,0,42.9873,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,590.11,914.73,0.0 +gfx950,256,384,7168,256,ck,18,0,6.5069,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,216.58,1143.15,0.0 +gfx950,256,384,7168,512,ck,18,0,8.5934,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,327.99,1090.56,0.0 +gfx950,256,384,7168,2048,ck,18,0,18.9583,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,594.69,1106.19,0.0 +gfx950,256,384,7168,2304,ck,0,0,20.7129,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,612.35,1105.82,0.0 +gfx950,256,384,7168,4096,ck,0,0,34.1125,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,661.01,1068.17,0.0 +gfx950,256,384,7168,4608,ck,0,0,37.3568,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,679.05,1078.91,0.0 +gfx950,256,384,7168,16384,ck,0,0,116.6161,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,773.43,1108.23,0.0 +gfx950,256,384,7168,18432,ck,0,0,137.0626,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,740.31,1055.75,0.0 +gfx950,256,384,8192,512,ck,18,0,8.908,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,361.61,1199.19,0.0 +gfx950,256,384,8192,1536,ck,18,0,15.9312,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,606.59,1221.77,0.0 +gfx950,256,384,9216,7168,ck,0,0,62.323,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,814.05,1217.7,0.0 +gfx950,256,384,11264,1536,ck,18,0,22.6307,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,587.15,1172.84,0.0 +gfx950,256,384,12288,1536,ck,18,0,23.4391,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,618.43,1233.04,0.0 +gfx950,256,384,14336,1536,ck,2,0,25.7776,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,656.05,1304.23,0.0 +gfx950,256,384,16384,512,ck,17,0,13.9619,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,461.43,1516.14,0.0 +gfx950,256,384,20480,1536,ck,0,0,28.7288,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,840.94,1662.99,0.0 +gfx950,256,384,24576,1536,cktile,11,0,32.299,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,897.58,1771.35,0.0 +gfx950,256,384,32768,512,ck,0,0,23.0787,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,558.3,1825.91,0.0 +gfx950,256,384,32768,1536,cktile,28,0,34.4376,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1122.46,2209.43,0.0 +gfx950,256,384,36864,7168,ck,0,0,210.4362,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,964.36,1403.3,0.0 +gfx950,256,416,128,7168,ck,8,0,20.0269,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.12,200.03,0.0 +gfx950,256,416,512,7168,ck,8,0,20.6355,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,147.97,343.0,0.0 +gfx950,256,416,1024,7168,ck,8,0,22.7135,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,268.87,491.95,0.0 +gfx950,256,416,2112,7168,ck,18,0,25.1696,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,500.42,789.76,0.0 +gfx950,256,416,2240,7168,ck,18,0,26.0516,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,512.78,802.33,0.0 +gfx950,256,416,3072,1536,ck,18,0,10.8287,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,362.54,730.79,0.0 +gfx950,256,416,4096,512,ck,17,0,6.4007,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,272.6,893.34,0.0 +gfx950,256,416,4096,7168,ck,18,0,38.7328,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,630.67,922.99,0.0 +gfx950,256,416,4608,7168,ck,18,0,40.166,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,684.19,992.03,0.0 +gfx950,256,416,7168,256,ck,13,0,6.5711,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,232.34,1203.04,0.0 +gfx950,256,416,7168,512,ck,13,0,9.2876,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,328.77,1060.21,0.0 +gfx950,256,416,7168,2048,ck,0,0,21.113,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,578.5,1018.13,0.0 +gfx950,256,416,7168,2304,ck,0,0,22.2607,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,617.26,1052.86,0.0 +gfx950,256,416,7168,4096,ck,0,0,37.3648,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,653.76,990.98,0.0 +gfx950,256,416,7168,4608,ck,0,0,37.6687,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,729.55,1086.07,0.0 +gfx950,256,416,7168,16384,ck,0,0,127.2482,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,767.87,1023.35,0.0 +gfx950,256,416,7168,18432,ck,0,0,140.696,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,781.29,1035.94,0.0 +gfx950,256,416,8192,512,ck,0,0,10.1211,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,344.79,1108.88,0.0 +gfx950,256,416,8192,1536,ck,0,0,17.5627,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,596.09,1140.92,0.0 +gfx950,256,416,9216,7168,ck,14,0,69.5662,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,790.07,1102.69,0.0 +gfx950,256,416,11264,1536,ck,18,0,25.0123,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,575.51,1091.95,0.0 +gfx950,256,416,12288,1536,ck,2,0,25.9648,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,604.8,1145.28,0.0 +gfx950,256,416,14336,1536,ck,0,0,27.6412,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,662.8,1251.27,0.0 +gfx950,256,416,16384,512,ck,18,0,15.1957,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,459.3,1463.12,0.0 +gfx950,256,416,20480,1536,cktile,28,0,31.9647,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,818.79,1537.18,0.0 +gfx950,256,416,24576,1536,ck,0,0,43.5537,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,721.11,1350.86,0.0 +gfx950,256,416,32768,512,ck,0,0,27.9556,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,499.31,1582.98,0.0 +gfx950,256,416,32768,1536,ck,0,0,57.2349,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,731.65,1366.89,0.0 +gfx950,256,416,36864,7168,cktile,28,0,174.5087,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1259.81,1707.04,0.0 +gfx950,256,448,128,7168,ck,8,0,19.9415,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.22,212.8,0.0 +gfx950,256,448,512,7168,ck,8,0,20.6355,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,159.35,355.7,0.0 +gfx950,256,448,1024,7168,ck,18,0,23.1171,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,284.49,496.12,0.0 +gfx950,256,448,2112,7168,ck,18,0,23.7136,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,572.01,853.62,0.0 +gfx950,256,448,2240,7168,ck,18,0,25.0899,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,573.4,847.94,0.0 +gfx950,256,448,3072,1536,ck,18,0,10.8713,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,388.9,750.53,0.0 +gfx950,256,448,4096,512,ck,13,0,6.8369,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,274.84,877.09,0.0 +gfx950,256,448,4096,7168,ck,18,0,42.0773,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,625.2,861.31,0.0 +gfx950,256,448,4608,7168,ck,18,0,44.0377,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,672.04,916.72,0.0 +gfx950,256,448,7168,256,ck,18,0,7.088,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,231.96,1181.18,0.0 +gfx950,256,448,7168,512,ck,2,0,9.9265,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,331.27,1039.83,0.0 +gfx950,256,448,7168,2048,ck,0,0,21.0124,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,625.98,1047.96,0.0 +gfx950,256,448,7168,2304,ck,0,0,21.9519,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,674.09,1091.92,0.0 +gfx950,256,448,7168,4096,ck,0,0,37.988,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,692.5,990.25,0.0 +gfx950,256,448,7168,4608,ck,0,0,39.782,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,743.93,1043.61,0.0 +gfx950,256,448,7168,16384,ck,0,0,127.2333,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,827.04,1031.2,0.0 +gfx950,256,448,7168,18432,ck,0,0,141.5706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,836.19,1036.94,0.0 +gfx950,256,448,8192,512,ck,0,0,10.2386,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,367.05,1148.96,0.0 +gfx950,256,448,8192,1536,ck,0,0,17.232,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,654.26,1196.09,0.0 +gfx950,256,448,9216,7168,ck,14,0,69.6246,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,850.13,1113.53,0.0 +gfx950,256,448,11264,1536,ck,18,0,24.1979,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,640.64,1160.52,0.0 +gfx950,256,448,12288,1536,ck,2,0,25.5359,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,662.26,1197.24,0.0 +gfx950,256,448,14336,1536,ck,0,0,27.4252,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,719.41,1296.37,0.0 +gfx950,256,448,16384,512,ck,18,0,15.8837,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,473.2,1466.79,0.0 +gfx950,256,448,20480,1536,ck,2,0,40.9444,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,688.39,1233.27,0.0 +gfx950,256,448,24576,1536,ck,0,0,45.0677,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,750.49,1341.47,0.0 +gfx950,256,448,32768,512,ck,0,0,27.7693,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,541.33,1669.71,0.0 +gfx950,256,448,32768,1536,ck,0,0,58.9473,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,765.04,1363.59,0.0 +gfx950,256,448,36864,7168,ck,0,0,244.4989,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,968.35,1228.97,0.0 +gfx950,256,480,128,7168,ck,8,0,20.0486,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.93,223.51,0.0 +gfx950,256,480,512,7168,ck,8,0,20.5967,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,171.06,369.1,0.0 +gfx950,256,480,1024,7168,ck,7,0,23.5657,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,299.01,499.19,0.0 +gfx950,256,480,2112,7168,ck,18,0,32.4638,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,447.68,634.77,0.0 +gfx950,256,480,2240,7168,ck,18,0,34.7895,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,443.07,622.24,0.0 +gfx950,256,480,3072,1536,ck,18,0,11.1376,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,406.72,754.65,0.0 +gfx950,256,480,4096,512,ck,18,0,6.3449,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,317.3,988.99,0.0 +gfx950,256,480,4096,7168,ck,2,0,44.0884,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,639.3,833.17,0.0 +gfx950,256,480,4608,7168,ck,18,0,52.5105,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,603.86,778.79,0.0 +gfx950,256,480,7168,256,ck,13,0,7.4843,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,235.37,1181.03,0.0 +gfx950,256,480,7168,512,ck,18,0,10.0728,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,349.78,1071.9,0.0 +gfx950,256,480,7168,2048,ck,0,0,21.2321,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,663.75,1061.81,0.0 +gfx950,256,480,7168,2304,ck,0,0,22.08,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,718.05,1109.7,0.0 +gfx950,256,480,7168,4096,ck,0,0,39.2544,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,718.03,973.33,0.0 +gfx950,256,480,7168,4608,ck,0,0,40.9212,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.88,1029.38,0.0 +gfx950,256,480,7168,16384,ck,0,0,129.3282,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,871.76,1022.1,0.0 +gfx950,256,480,7168,18432,ck,0,0,144.4652,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,877.97,1023.42,0.0 +gfx950,256,480,8192,512,ck,0,0,10.3243,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,390.01,1191.79,0.0 +gfx950,256,480,8192,1536,ck,0,0,17.4656,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,691.62,1212.93,0.0 +gfx950,256,480,9216,7168,ck,0,0,92.5317,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,685.36,846.72,0.0 +gfx950,256,480,11264,1536,ck,2,0,26.6676,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,622.83,1081.92,0.0 +gfx950,256,480,12288,1536,ck,0,0,26.8339,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,675.24,1170.46,0.0 +gfx950,256,480,14336,1536,ck,0,0,27.936,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,756.7,1307.27,0.0 +gfx950,256,480,16384,512,ck,0,0,16.0696,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,501.14,1516.09,0.0 +gfx950,256,480,20480,1536,cktile,27,0,32.2625,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,936.04,1607.3,0.0 +gfx950,256,480,24576,1536,ck,0,0,46.2753,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,783.11,1341.51,0.0 +gfx950,256,480,32768,512,ck,0,0,29.29,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,549.88,1655.18,0.0 +gfx950,256,480,32768,1536,ck,0,0,57.6721,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,837.81,1430.96,0.0 +gfx950,256,480,36864,7168,cktile,12,0,181.862,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,1394.86,1666.49,0.0 +gfx950,256,512,64,7168,ck,8,0,19.8437,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.67,211.37,0.0 +gfx950,256,512,128,7168,ck,8,0,19.9742,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.04,236.23,0.0 +gfx950,256,512,512,7168,ck,8,0,20.7315,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,181.27,379.34,0.0 +gfx950,256,512,576,7168,ck,18,0,22.9789,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,183.99,365.06,0.0 +gfx950,256,512,1024,7168,ck,18,0,23.1935,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,324.06,519.91,0.0 +gfx950,256,512,1536,7168,ck,18,0,23.3788,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,482.24,695.2,0.0 +gfx950,256,512,2112,7168,ck,18,0,32.0652,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,483.46,654.03,0.0 +gfx950,256,512,2240,7168,ck,18,0,34.4425,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,477.37,639.33,0.0 +gfx950,256,512,3072,1536,ck,18,0,11.2304,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,430.25,770.3,0.0 +gfx950,256,512,4096,512,ck,18,0,6.7823,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.63,966.28,0.0 +gfx950,256,512,4096,7168,ck,18,0,44.5905,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,674.24,834.81,0.0 +gfx950,256,512,4608,7168,ck,18,0,53.8717,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,627.84,768.84,0.0 +gfx950,256,512,7168,256,cktile,10,0,7.6955,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,244.17,1209.29,0.0 +gfx950,256,512,7168,512,ck,0,0,9.9586,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,377.37,1131.91,0.0 +gfx950,256,512,7168,2048,ck,0,0,20.7712,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,723.71,1110.61,0.0 +gfx950,256,512,7168,2304,ck,0,0,22.3331,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,757.24,1120.97,0.0 +gfx950,256,512,7168,4096,ck,0,0,37.9824,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,791.54,1021.45,0.0 +gfx950,256,512,7168,4608,ck,0,0,38.6734,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,874.58,1104.88,0.0 +gfx950,256,512,7168,16384,ck,0,0,126.7885,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,948.5,1050.33,0.0 +gfx950,256,512,7168,18432,ck,0,0,140.2762,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,964.46,1061.46,0.0 +gfx950,256,512,8192,512,ck,0,0,10.2043,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,420.9,1258.79,0.0 +gfx950,256,512,8192,1536,ck,0,0,17.0218,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,756.96,1278.24,0.0 +gfx950,256,512,9216,7168,ck,18,0,93.1119,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,726.5,850.24,0.0 +gfx950,256,512,11264,1536,ck,0,0,26.045,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,680.24,1137.35,0.0 +gfx950,256,512,12288,1536,ck,2,0,26.2594,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,736.02,1227.89,0.0 +gfx950,256,512,14336,1536,ck,0,0,27.3124,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,825.58,1372.51,0.0 +gfx950,256,512,16384,512,ck,0,0,16.3731,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,524.64,1553.03,0.0 +gfx950,256,512,20480,1536,cktile,27,0,32.2441,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,999.01,1650.39,0.0 +gfx950,256,512,24576,1536,ck,0,0,46.3688,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,833.64,1373.79,0.0 +gfx950,256,512,32768,512,ck,0,0,29.3632,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,585.08,1723.03,0.0 +gfx950,256,512,32768,1536,ck,0,0,58.6281,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,879.09,1444.23,0.0 +gfx950,256,512,36864,7168,ck,0,0,256.8593,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1053.43,1189.99,0.0 +gfx950,256,1024,64,7168,ck,8,0,20.1439,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.64,393.66,0.0 +gfx950,256,1024,128,7168,ck,8,0,20.3747,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.22,418.15,0.0 +gfx950,256,1024,512,7168,ck,7,0,22.8839,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,328.45,526.95,0.0 +gfx950,256,1024,576,7168,ck,18,0,23.2238,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,364.1,544.63,0.0 +gfx950,256,1024,1024,7168,ck,18,0,24.4927,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,613.75,684.99,0.0 +gfx950,256,1024,1536,7168,ck,18,0,38.4168,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,586.95,559.54,0.0 +gfx950,256,1024,2112,7168,ck,18,0,51.2969,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,604.41,522.53,0.0 +gfx950,256,1024,2240,7168,ck,18,0,53.7053,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,612.29,521.06,0.0 +gfx950,256,1024,3072,1536,ck,18,0,16.0053,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,603.78,786.17,0.0 +gfx950,256,1024,4096,512,ck,0,0,10.0853,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,425.86,1091.69,0.0 +gfx950,256,1024,4096,7168,ck,0,0,63.9434,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,940.36,705.14,0.0 +gfx950,256,1024,4608,7168,cktile,11,0,85.0782,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,795.1,585.43,0.0 +gfx950,256,1024,7168,256,cktile,10,0,11.2282,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,334.7,1494.2,0.0 +gfx950,256,1024,7168,512,ck,0,0,15.4877,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,485.3,1218.67,0.0 +gfx950,256,1024,7168,2048,ck,0,0,34.0461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,883.06,923.96,0.0 +gfx950,256,1024,7168,2304,ck,0,0,35.8476,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,943.52,936.03,0.0 +gfx950,256,1024,7168,4096,ck,0,0,63.147,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,952.22,763.84,0.0 +gfx950,256,1024,7168,4608,ck,0,0,71.2174,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,949.85,736.18,0.0 +gfx950,256,1024,7168,16384,ck,0,0,216.3222,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1111.85,688.31,0.0 +gfx950,256,1024,7168,18432,ck,0,0,254.6017,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1062.77,650.72,0.0 +gfx950,256,1024,8192,512,ck,0,0,15.952,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.49,1347.53,0.0 +gfx950,256,1024,8192,1536,ck,0,0,29.156,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,883.86,1060.95,0.0 +gfx950,256,1024,9216,7168,cktile,28,0,88.3744,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1530.89,1044.13,0.0 +gfx950,256,1024,11264,1536,ck,0,0,42.788,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,828.12,980.25,0.0 +gfx950,256,1024,12288,1536,ck,0,0,45.7325,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,845.23,997.39,0.0 +gfx950,256,1024,14336,1536,ck,0,0,54.4605,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,828.07,972.32,0.0 +gfx950,256,1024,16384,512,ck,0,0,29.472,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,582.92,1440.94,0.0 +gfx950,256,1024,20480,1536,ck,0,0,74.004,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,870.55,1013.1,0.0 +gfx950,256,1024,24576,1536,ck,0,0,84.1555,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,918.65,1065.33,0.0 +gfx950,256,1024,32768,512,ck,0,0,54.7645,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,627.41,1541.33,0.0 +gfx950,256,1024,32768,1536,ck,0,0,109.5044,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,941.32,1086.84,0.0 +gfx950,256,1024,36864,7168,ck,0,0,454.1379,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1191.63,764.26,0.0 +gfx950,256,1536,512,7168,ck,18,0,23.412,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,481.56,694.21,0.0 +gfx950,256,1536,576,7168,ck,18,0,24.205,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,524.01,698.55,0.0 +gfx950,256,1536,1536,7168,ck,18,0,52.7469,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,641.23,506.92,0.0 +gfx950,256,1536,3072,1536,ck,18,0,23.0151,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,629.83,717.58,0.0 +gfx950,256,1536,4096,512,ck,18,0,13.642,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.25,1133.74,0.0 +gfx950,256,1536,4608,7168,cktile,11,0,90.2343,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1124.5,644.94,0.0 +gfx950,256,1536,7168,256,cktile,10,0,13.3199,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,423.21,1820.46,0.0 +gfx950,256,1536,7168,2048,cktile,11,0,37.6651,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1197.32,1057.9,0.0 +gfx950,256,1536,7168,2304,cktile,28,0,40.5466,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1251.26,1037.67,0.0 +gfx950,256,2048,64,7168,ck,8,0,19.7903,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.95,778.21,0.0 +gfx950,256,2048,128,7168,ck,8,0,20.2497,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,185.59,796.15,0.0 +gfx950,256,2048,512,7168,ck,18,0,24.9633,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,602.18,819.09,0.0 +gfx950,256,2048,576,7168,ck,18,0,33.9564,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,498.03,623.39,0.0 +gfx950,256,2048,1024,7168,ck,18,0,42.9028,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,700.76,611.02,0.0 +gfx950,256,2048,1536,7168,ck,0,0,57.9353,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,778.41,552.02,0.0 +gfx950,256,2048,2112,7168,ck,2,0,88.3043,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,702.21,435.65,0.0 +gfx950,256,2048,2240,7168,ck,2,0,89.9975,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,730.76,443.47,0.0 +gfx950,256,2048,3072,1536,ck,0,0,25.9919,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,743.59,786.68,0.0 +gfx950,256,2048,4096,512,ck,0,0,15.9389,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.93,1249.96,0.0 +gfx950,256,2048,4096,7168,ck,0,0,114.7033,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1048.44,530.21,0.0 +gfx950,256,2048,4608,7168,cktile,27,0,88.3939,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1530.55,753.27,0.0 +gfx950,256,2048,7168,256,cktile,10,0,18.4391,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,407.62,1720.23,0.0 +gfx950,256,2048,7168,512,ck,0,0,27.0315,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,556.11,1260.7,0.0 +gfx950,256,2048,7168,2048,ck,0,0,68.5722,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,876.88,703.41,0.0 +gfx950,256,2048,7168,2304,ck,0,0,69.6842,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,970.75,726.04,0.0 +gfx950,256,2048,7168,4096,ck,0,0,118.0138,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1019.03,568.65,0.0 +gfx950,256,2048,7168,4608,ck,0,0,128.9241,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1049.39,557.13,0.0 +gfx950,256,2048,7168,16384,ck,0,0,415.9935,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1156.36,433.55,0.0 +gfx950,256,2048,7168,18432,ck,0,0,473.0973,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1143.88,421.12,0.0 +gfx950,256,2048,8192,512,ck,0,0,29.4968,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,582.43,1315.31,0.0 +gfx950,256,2048,8192,1536,ck,0,0,56.6629,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,909.58,869.76,0.0 +gfx950,256,2048,9216,7168,cktile,26,0,172.1216,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1572.05,688.4,0.0 +gfx950,256,2048,11264,1536,cktile,27,0,62.8212,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1128.07,1059.91,0.0 +gfx950,256,2048,12288,1536,ck,0,0,83.8143,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,922.39,863.24,0.0 +gfx950,256,2048,14336,1536,ck,0,0,96.0698,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,938.84,873.18,0.0 +gfx950,256,2048,16384,512,ck,0,0,55.3461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,620.82,1383.04,0.0 +gfx950,256,2048,20480,1536,ck,0,0,132.3682,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,973.41,895.15,0.0 +gfx950,256,2048,24576,1536,ck,0,0,153.4331,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1007.73,922.6,0.0 +gfx950,256,2048,32768,512,ck,0,0,103.5934,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,663.36,1467.7,0.0 +gfx950,256,2048,32768,1536,ck,0,0,196.9804,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1046.59,952.86,0.0 +gfx950,256,2048,36864,7168,cktile,27,0,554.6038,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1951.54,775.18,0.0 +gfx950,256,4096,64,7168,ck,8,0,21.3467,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,176.05,1421.45,0.0 +gfx950,256,4096,128,7168,ck,18,0,23.7564,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.39,1318.64,0.0 +gfx950,256,4096,512,7168,ck,18,0,44.3253,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,678.28,839.8,0.0 +gfx950,256,4096,576,7168,ck,18,0,55.4301,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,610.19,689.29,0.0 +gfx950,256,4096,1024,7168,ck,0,0,66.6998,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,901.5,676.0,0.0 +gfx950,256,4096,1536,7168,cktile,11,0,90.0488,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1001.62,588.05,0.0 +gfx950,256,4096,2112,7168,cktile,28,0,91.0323,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1362.34,678.88,0.0 +gfx950,256,4096,2240,7168,cktile,27,0,91.0389,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1444.8,700.43,0.0 +gfx950,256,4096,3072,1536,ck,0,0,42.2397,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,915.13,856.44,0.0 +gfx950,256,4096,4096,512,ck,0,0,28.9648,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,593.13,1303.26,0.0 +gfx950,256,4096,4096,7168,ck,0,0,209.3355,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1148.96,440.8,0.0 +gfx950,256,4096,4608,7168,cktile,28,0,173.1867,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1562.38,578.21,0.0 +gfx950,256,4096,7168,256,cktile,10,0,29.5555,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,508.62,2084.34,0.0 +gfx950,256,4096,7168,512,ck,0,0,46.6829,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,644.02,1381.39,0.0 +gfx950,256,4096,7168,2048,ck,0,0,120.1982,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1000.51,680.45,0.0 +gfx950,256,4096,7168,2304,cktile,11,0,117.9301,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1147.22,717.99,0.0 +gfx950,256,4096,7168,4096,ck,0,0,214.4016,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.81,489.07,0.0 +gfx950,256,4096,7168,4608,ck,0,0,228.59,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1183.7,483.94,0.0 +gfx950,256,4096,7168,16384,cktile,11,0,521.4023,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1845.16,466.57,0.0 +gfx950,256,4096,7168,18432,ck,0,0,881.3469,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1228.04,302.19,0.0 +gfx950,256,4096,8192,512,ck,0,0,56.3409,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,609.85,1302.79,0.0 +gfx950,256,4096,8192,1536,ck,0,0,105.2588,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,979.29,816.87,0.0 +gfx950,256,4096,9216,7168,cktile,28,0,296.1434,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1827.38,577.15,0.0 +gfx950,256,4096,11264,1536,ck,0,0,142.0615,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,997.69,815.62,0.0 +gfx950,256,4096,12288,1536,ck,0,0,152.3699,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1014.76,825.81,0.0 +gfx950,256,4096,14336,1536,ck,0,0,176.1737,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1023.92,827.32,0.0 +gfx950,256,4096,16384,512,ck,0,0,103.9428,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,661.13,1392.15,0.0 +gfx950,256,4096,20480,1536,ck,0,0,238.3449,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1081.2,862.28,0.0 +gfx950,256,4096,24576,1536,ck,0,0,282.6274,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1094.15,868.16,0.0 +gfx950,256,4096,32768,512,ck,14,0,207.1159,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,663.58,1387.19,0.0 +gfx950,256,4096,32768,1536,ck,0,0,374.8153,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1100.05,867.25,0.0 +gfx950,256,4096,36864,7168,cktile,28,0,1082.0215,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2000.57,550.44,0.0 +gfx950,256,6144,128,7168,ck,18,0,25.0579,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,449.93,1856.92,0.0 +gfx950,256,6144,512,7168,ck,0,0,59.4018,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,759.19,909.09,0.0 +gfx950,256,6144,1024,7168,ck,0,0,101.4656,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,888.92,630.39,0.0 +gfx950,256,6144,2112,7168,cktile,13,0,162.6521,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_4,1143.7,523.39,0.0 +gfx950,256,6144,2240,7168,cktile,13,0,167.5962,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_4,1177.23,522.81,0.0 +gfx950,256,6144,3072,1536,ck,0,0,67.303,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,861.51,771.21,0.0 +gfx950,256,6144,4096,512,ck,0,0,37.6611,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,684.26,1475.65,0.0 +gfx950,256,6144,4096,7168,cktile,11,0,179.7043,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2007.62,688.53,0.0 +gfx950,256,6144,4608,7168,ck,0,0,343.9947,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1179.89,388.65,0.0 +gfx950,256,6144,7168,256,ck,17,0,51.0329,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,441.84,1792.73,0.0 +gfx950,256,6144,7168,512,ck,0,0,70.5242,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,639.46,1345.58,0.0 +gfx950,256,6144,7168,2048,ck,0,0,173.8221,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1037.78,663.57,0.0 +gfx950,256,6144,7168,2304,ck,0,0,188.2806,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1077.84,630.71,0.0 +gfx950,256,6144,7168,4096,ck,0,0,308.4794,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1169.53,462.29,0.0 +gfx950,256,6144,7168,4608,cktile,27,0,241.4545,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1680.96,618.84,0.0 +gfx950,256,6144,7168,16384,cktile,28,0,704.2201,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2049.23,434.78,0.0 +gfx950,256,6144,7168,18432,cktile,26,0,788.6719,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2058.52,422.8,0.0 +gfx950,256,6144,8192,512,ck,0,0,76.3779,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,674.8,1414.07,0.0 +gfx950,256,6144,8192,1536,ck,0,0,147.4517,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1048.61,832.02,0.0 +gfx950,256,6144,9216,7168,cktile,26,0,417.6534,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1943.59,534.77,0.0 +gfx950,256,6144,11264,1536,ck,0,0,206.4783,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1029.65,799.85,0.0 +gfx950,256,6144,12288,1536,ck,0,0,216.1487,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1073.0,829.55,0.0 +gfx950,256,6144,14336,1536,ck,0,0,253.0583,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1069.25,820.44,0.0 +gfx950,256,6144,16384,512,ck,0,0,141.5297,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,728.32,1504.0,0.0 +gfx950,256,6144,20480,1536,ck,0,0,353.1291,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1094.63,828.46,0.0 +gfx950,256,6144,24576,1536,ck,0,0,417.451,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1111.16,836.45,0.0 +gfx950,256,6144,32768,512,ck,0,0,270.6067,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,761.84,1561.59,0.0 +gfx950,256,6144,32768,1536,ck,0,0,552.4767,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.46,837.0,0.0 +gfx950,256,6144,36864,7168,cktile,26,0,1548.0338,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2097.5,491.76,0.0 +gfx950,256,8192,64,7168,ck,18,0,25.7151,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,292.29,2342.11,0.0 +gfx950,256,8192,128,7168,ck,18,0,26.8171,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,560.55,2302.07,0.0 +gfx950,256,8192,576,7168,ck,0,0,89.2735,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,757.74,809.72,0.0 +gfx950,256,8192,1024,7168,ck,0,0,119.4233,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1007.0,693.65,0.0 +gfx950,256,8192,1536,7168,cktile,11,0,158.9815,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1134.65,596.9,0.0 +gfx950,256,8192,2112,7168,cktile,26,0,177.1163,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1400.4,612.38,0.0 +gfx950,256,8192,2240,7168,cktile,27,0,177.0258,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1486.04,629.72,0.0 +gfx950,256,8192,3072,1536,ck,0,0,82.3211,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,939.12,821.58,0.0 +gfx950,256,8192,4096,512,ck,0,0,52.9601,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,648.79,1385.96,0.0 +gfx950,256,8192,4096,7168,cktile,28,0,261.1689,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1841.86,594.21,0.0 +gfx950,256,8192,4608,7168,cktile,28,0,292.7567,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1848.52,571.29,0.0 +gfx950,256,8192,7168,256,cktile,10,0,54.7017,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,549.61,2218.81,0.0 +gfx950,256,8192,7168,512,ck,0,0,85.2883,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,705.02,1469.19,0.0 +gfx950,256,8192,7168,2048,ck,0,0,221.4286,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1086.21,672.44,0.0 +gfx950,256,8192,7168,2304,cktile,11,0,199.6737,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1355.13,765.4,0.0 +gfx950,256,8192,7168,4096,ck,0,0,400.2668,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1201.79,450.59,0.0 +gfx950,256,8192,7168,4608,cktile,28,0,306.6077,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1765.01,613.88,0.0 +gfx950,256,8192,7168,16384,cktile,26,0,880.476,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2185.35,419.2,0.0 +gfx950,256,8192,7168,18432,cktile,28,0,993.9915,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2177.75,402.98,0.0 +gfx950,256,8192,8192,512,ck,0,0,102.7856,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,668.57,1387.42,0.0 +gfx950,256,8192,8192,1536,ck,0,0,195.7465,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1053.19,814.23,0.0 +gfx950,256,8192,9216,7168,cktile,27,0,525.3431,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2060.24,524.94,0.0 +gfx950,256,8192,11264,1536,ck,0,0,258.0178,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1098.64,831.08,0.0 +gfx950,256,8192,12288,1536,ck,0,0,285.766,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1082.14,814.6,0.0 +gfx950,256,8192,14336,1536,ck,0,0,326.789,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1104.01,824.64,0.0 +gfx950,256,8192,16384,512,ck,0,0,186.7906,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,735.79,1504.46,0.0 +gfx950,256,8192,20480,1536,ck,0,0,462.2371,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.0,821.19,0.0 +gfx950,256,8192,24576,1536,ck,0,0,554.2004,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.98,817.37,0.0 +gfx950,256,8192,32768,512,ck,0,0,369.0041,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,744.92,1511.75,0.0 +gfx950,256,8192,32768,1536,ck,0,0,733.9936,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.49,817.15,0.0 +gfx950,256,8192,36864,7168,cktile,28,0,2053.303,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2108.47,451.44,0.0 +gfx950,256,10240,128,7168,ck,18,0,42.6021,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,441.07,1806.0,0.0 +gfx950,256,10240,512,7168,ck,0,0,96.4596,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,779.21,907.7,0.0 +gfx950,256,10240,1024,7168,cktile,28,0,93.1432,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1613.9,1091.99,0.0 +gfx950,256,10240,2112,7168,cktile,27,0,182.2038,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1701.63,723.33,0.0 +gfx950,256,10240,2240,7168,cktile,11,0,184.1157,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1786.02,735.04,0.0 +gfx950,256,10240,3072,1536,ck,0,0,102.0072,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,947.35,817.21,0.0 +gfx950,256,10240,4096,512,ck,0,0,65.3698,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,657.03,1395.54,0.0 +gfx950,256,10240,4096,7168,ck,0,0,489.7941,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1227.65,381.07,0.0 +gfx950,256,10240,4608,7168,cktile,27,0,346.7326,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1950.95,579.13,0.0 +gfx950,256,10240,7168,256,ck,18,0,81.5831,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,460.65,1854.02,0.0 +gfx950,256,10240,7168,512,ck,0,0,103.3089,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,727.55,1507.26,0.0 +gfx950,256,10240,7168,2048,ck,0,0,271.7203,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1106.46,671.47,0.0 +gfx950,256,10240,7168,2304,ck,0,0,291.7057,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1159.49,640.74,0.0 +gfx950,256,10240,7168,4096,cktile,11,0,335.9717,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1789.72,649.17,0.0 +gfx950,256,10240,7168,4608,cktile,27,0,363.5745,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1860.57,624.4,0.0 +gfx950,256,10240,7168,16384,cktile,28,0,1085.6779,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2215.37,397.92,0.0 +gfx950,256,10240,7168,18432,cktile,26,0,1209.4499,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2237.24,386.68,0.0 +gfx950,256,10240,8192,512,ck,0,0,118.4046,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,725.47,1496.64,0.0 +gfx950,256,10240,8192,1536,ck,0,0,238.501,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1080.49,822.15,0.0 +gfx950,256,10240,9216,7168,cktile,11,0,664.1018,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2037.21,494.21,0.0 +gfx950,256,10240,11264,1536,ck,0,0,323.467,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1095.43,815.28,0.0 +gfx950,256,10240,12288,1536,ck,0,0,353.3679,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1093.89,810.09,0.0 +gfx950,256,10240,14336,1536,ck,0,0,409.667,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1100.82,808.83,0.0 +gfx950,256,10240,16384,512,ck,0,0,225.2848,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,762.58,1549.93,0.0 +gfx950,256,10240,20480,1536,ck,0,0,576.4526,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1117.6,809.46,0.0 +gfx950,256,10240,24576,1536,ck,0,0,688.7802,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.41,808.38,0.0 +gfx950,256,10240,32768,512,ck,0,0,442.473,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,776.54,1566.44,0.0 +gfx950,256,10240,32768,1536,ck,0,0,917.7738,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.14,803.19,0.0 +gfx950,256,10240,36864,7168,cktile,26,0,2574.9353,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2101.67,424.33,0.0 +gfx950,256,12288,128,7168,ck,18,0,44.9391,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,501.76,2050.41,0.0 +gfx950,256,12288,512,7168,ck,0,0,102.6049,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,879.04,1016.85,0.0 +gfx950,256,12288,1024,7168,cktile,28,0,97.9679,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1841.3,1230.88,0.0 +gfx950,256,12288,2112,7168,cktile,26,0,259.1603,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1435.6,598.56,0.0 +gfx950,256,12288,2240,7168,cktile,26,0,261.1259,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1511.15,609.62,0.0 +gfx950,256,12288,3072,1536,ck,0,0,119.017,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,974.35,832.57,0.0 +gfx950,256,12288,4096,512,ck,0,0,77.2939,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,666.8,1410.87,0.0 +gfx950,256,12288,4096,7168,cktile,11,0,352.7815,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2045.33,618.24,0.0 +gfx950,256,12288,4608,7168,cktile,11,0,434.5259,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1868.13,539.34,0.0 +gfx950,256,12288,7168,256,cktile,6,0,94.972,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x32_intrawave_0x0x0x0_1,474.85,1907.31,0.0 +gfx950,256,12288,7168,512,ck,0,0,124.0238,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,727.23,1500.7,0.0 +gfx950,256,12288,7168,2048,ck,0,0,316.8378,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1138.68,681.76,0.0 +gfx950,256,12288,7168,2304,ck,0,0,339.0438,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1197.11,651.8,0.0 +gfx950,256,12288,7168,4096,cktile,27,0,395.2387,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1825.62,647.34,0.0 +gfx950,256,12288,7168,4608,cktile,11,0,425.8071,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1906.38,624.26,0.0 +gfx950,256,12288,7168,16384,cktile,28,0,1279.8588,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2255.11,386.71,0.0 +gfx950,256,12288,7168,18432,cktile,11,0,1389.9798,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2336.0,384.73,0.0 +gfx950,256,12288,8192,512,ck,0,0,138.724,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,743.05,1526.86,0.0 +gfx950,256,12288,8192,1536,ck,0,0,284.452,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1087.13,818.36,0.0 +gfx950,256,12288,9216,7168,cktile,11,0,787.3284,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2062.03,483.45,0.0 +gfx950,256,12288,11264,1536,ck,0,0,383.3207,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1109.26,816.55,0.0 +gfx950,256,12288,12288,1536,ck,0,0,419.245,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1106.41,810.36,0.0 +gfx950,256,12288,14336,1536,ck,0,0,485.2263,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.29,810.38,0.0 +gfx950,256,12288,16384,512,ck,0,0,271.3271,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,759.82,1538.12,0.0 +gfx950,256,12288,20480,1536,ck,0,0,688.6851,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.57,803.92,0.0 +gfx950,256,12288,24576,1536,ck,0,0,822.4186,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.03,803.24,0.0 +gfx950,256,12288,32768,512,ck,0,0,530.0213,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,777.93,1562.91,0.0 +gfx950,256,12288,32768,1536,ck,0,0,1098.3661,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1126.17,796.19,0.0 +gfx950,256,12288,36864,7168,cktile,26,0,3042.4958,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2134.43,413.57,0.0 +gfx950,256,14336,128,7168,ck,18,0,45.8225,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,574.1,2342.69,0.0 +gfx950,256,14336,512,7168,ck,0,0,117.1634,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,898.12,1033.69,0.0 +gfx950,256,14336,1024,7168,ck,0,0,203.2423,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1035.48,686.18,0.0 +gfx950,256,14336,2112,7168,cktile,28,0,264.6636,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1640.04,674.27,0.0 +gfx950,256,14336,2240,7168,cktile,27,0,265.5412,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1733.69,689.32,0.0 +gfx950,256,14336,3072,1536,ck,0,0,140.8499,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,960.54,815.19,0.0 +gfx950,256,14336,4096,512,ck,0,0,89.2808,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,673.49,1421.11,0.0 +gfx950,256,14336,4096,7168,cktile,28,0,431.6564,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1950.19,578.15,0.0 +gfx950,256,14336,4608,7168,cktile,28,0,499.0381,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1897.73,536.86,0.0 +gfx950,256,14336,7168,256,ck,17,0,110.8353,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,474.7,1903.96,0.0 +gfx950,256,14336,7168,512,ck,0,0,143.1255,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,735.21,1512.87,0.0 +gfx950,256,14336,7168,2048,ck,0,0,373.5706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1126.71,668.04,0.0 +gfx950,256,14336,7168,2304,ck,0,0,401.7208,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1178.73,634.93,0.0 +gfx950,256,14336,7168,4096,cktile,11,0,451.527,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1864.37,650.24,0.0 +gfx950,256,14336,7168,4608,cktile,26,0,491.627,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1926.34,619.6,0.0 +gfx950,256,14336,7168,16384,cktile,26,0,1445.403,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2329.63,385.94,0.0 +gfx950,256,14336,7168,18432,cktile,26,0,1627.8742,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2327.06,369.74,0.0 +gfx950,256,14336,8192,512,ck,0,0,160.1493,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,750.92,1538.66,0.0 +gfx950,256,14336,8192,1536,ck,0,0,330.5132,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1091.57,815.35,0.0 +gfx950,256,14336,9216,7168,cktile,11,0,886.8712,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2135.69,488.3,0.0 +gfx950,256,14336,11264,1536,ck,0,0,446.7232,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.46,810.98,0.0 +gfx950,256,14336,12288,1536,ck,0,0,485.1467,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.47,810.51,0.0 +gfx950,256,14336,14336,1536,ck,0,0,568.5764,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.42,800.39,0.0 +gfx950,256,14336,16384,512,ck,0,0,311.9278,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,771.07,1556.42,0.0 +gfx950,256,14336,20480,1536,ck,0,0,805.6795,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.48,795.2,0.0 +gfx950,256,14336,24576,1536,ck,0,0,961.2433,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1125.97,795.23,0.0 +gfx950,256,14336,32768,512,ck,0,0,610.1813,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,788.35,1579.27,0.0 +gfx950,256,14336,32768,1536,ck,0,0,1286.8515,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.43,786.32,0.0 +gfx950,256,14336,36864,7168,cktile,11,0,3573.2083,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2120.31,398.51,0.0 +gfx950,256,16384,64,7168,ck,18,0,32.1288,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,467.88,3734.86,0.0 +gfx950,256,16384,128,7168,ck,18,0,49.2013,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,611.06,2490.83,0.0 +gfx950,256,16384,576,7168,ck,0,0,153.6456,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,880.54,914.08,0.0 +gfx950,256,16384,1024,7168,ck,0,0,216.1688,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1112.64,732.46,0.0 +gfx950,256,16384,1536,7168,cktile,11,0,256.6513,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1405.71,696.6,0.0 +gfx950,256,16384,2112,7168,cktile,11,0,306.4497,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1618.76,658.46,0.0 +gfx950,256,16384,2240,7168,cktile,11,0,307.9861,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1708.3,671.77,0.0 +gfx950,256,16384,3072,1536,ck,0,0,155.5478,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,994.03,839.28,0.0 +gfx950,256,16384,4096,512,ck,0,0,101.0461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,680.08,1432.05,0.0 +gfx950,256,16384,4096,7168,cktile,27,0,511.7685,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1879.9,549.11,0.0 +gfx950,256,16384,4608,7168,cktile,26,0,520.5986,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2079.01,579.07,0.0 +gfx950,256,16384,6144,1536,ck,0,0,292.1834,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1058.37,807.47,0.0 +gfx950,256,16384,7168,256,cktile,10,0,101.2722,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,593.74,2378.84,0.0 +gfx950,256,16384,7168,512,ck,0,0,164.8841,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,729.36,1497.66,0.0 +gfx950,256,16384,7168,2048,ck,0,0,421.5322,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1141.16,671.63,0.0 +gfx950,256,16384,7168,2304,cktile,11,0,371.4356,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1456.96,778.45,0.0 +gfx950,256,16384,7168,4096,cktile,27,0,520.6164,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1847.95,636.46,0.0 +gfx950,256,16384,7168,4608,cktile,26,0,560.4838,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1931.07,612.7,0.0 +gfx950,256,16384,7168,16384,cktile,28,0,1682.9186,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2286.68,368.86,0.0 +gfx950,256,16384,7168,18432,cktile,11,0,1877.5648,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2305.82,356.31,0.0 +gfx950,256,16384,8192,512,ck,0,0,187.5611,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,732.77,1498.28,0.0 +gfx950,256,16384,8192,1536,ck,0,0,379.4481,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1086.62,806.92,0.0 +gfx950,256,16384,9216,7168,cktile,26,0,1028.9257,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2103.81,471.84,0.0 +gfx950,256,16384,11264,1536,ck,0,0,507.1159,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1117.96,811.58,0.0 +gfx950,256,16384,12288,1536,ck,14,0,631.8107,a8w8_blockscale_1x128x128_256x64x256x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,978.89,707.01,0.0 +gfx950,256,16384,14336,1536,ck,0,0,643.287,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.67,803.6,0.0 +gfx950,256,16384,16384,512,ck,0,0,353.2954,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,778.04,1567.1,0.0 +gfx950,256,16384,20480,1536,ck,0,0,912.096,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1130.14,797.85,0.0 +gfx950,256,16384,24576,1536,ck,0,0,1102.0997,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.36,787.79,0.0 +gfx950,256,16384,32768,512,ck,0,0,709.5617,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.78,1548.71,0.0 +gfx950,256,16384,32768,1536,ck,0,0,1460.4078,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1129.32,786.93,0.0 +gfx950,256,16384,36864,7168,cktile,26,0,4061.1773,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2132.06,391.42,0.0 +gfx950,256,20480,576,7168,cktile,13,0,165.4454,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_4,1022.18,1054.86,0.0 +gfx950,256,20480,1536,7168,cktile,11,0,295.5655,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1525.79,746.79,0.0 +gfx950,256,20480,3072,1536,ck,0,0,186.8642,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1034.3,866.97,0.0 +gfx950,256,20480,4096,512,ck,0,0,119.3033,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,720.01,1511.74,0.0 +gfx950,256,20480,4608,7168,cktile,27,0,699.6189,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1933.79,526.82,0.0 +gfx950,256,20480,7168,256,cktile,10,0,125.313,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,599.79,2399.43,0.0 +gfx950,256,20480,7168,2048,ck,0,0,525.1612,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1144.97,666.89,0.0 +gfx950,256,20480,7168,2304,cktile,11,0,470.5953,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1437.45,759.26,0.0 +gfx950,256,32768,64,7168,ck,18,0,58.767,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,511.59,4076.0,0.0 +gfx950,256,32768,128,7168,ck,0,0,74.5115,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,806.98,3277.17,0.0 +gfx950,256,32768,512,7168,ck,0,0,221.9247,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1083.78,1226.12,0.0 +gfx950,256,32768,576,7168,cktile,11,0,223.8974,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1208.51,1236.1,0.0 +gfx950,256,32768,1024,7168,cktile,26,0,271.8378,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1769.57,1137.92,0.0 +gfx950,256,32768,1536,7168,cktile,11,0,384.6335,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1875.95,901.0,0.0 +gfx950,256,32768,2112,7168,cktile,27,0,534.9146,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1854.76,726.16,0.0 +gfx950,256,32768,2240,7168,cktile,26,0,541.7648,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1942.29,734.15,0.0 +gfx950,256,32768,3072,1536,ck,0,0,287.9805,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1073.81,890.26,0.0 +gfx950,256,32768,4096,512,ck,0,0,185.2578,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,741.88,1550.86,0.0 +gfx950,256,32768,4096,7168,cktile,28,0,951.6322,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2021.94,559.75,0.0 +gfx950,256,32768,4608,7168,cktile,28,0,1052.2794,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2057.12,541.59,0.0 +gfx950,256,32768,6144,1536,ck,0,0,556.7669,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.83,830.55,0.0 +gfx950,256,32768,7168,256,ck,17,0,239.5893,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,501.94,2003.37,0.0 +gfx950,256,32768,7168,512,ck,0,0,312.6371,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,769.32,1567.98,0.0 +gfx950,256,32768,7168,2048,ck,0,0,839.2502,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1146.35,657.19,0.0 +gfx950,256,32768,7168,2304,ck,0,0,901.4207,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1200.7,623.21,0.0 +gfx950,256,32768,7168,4096,cktile,28,0,1008.0074,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1908.86,628.31,0.0 +gfx950,256,32768,7168,4608,cktile,26,0,1082.5675,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1999.56,603.92,0.0 +gfx950,256,32768,7168,16384,cktile,26,0,3378.3991,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2278.17,332.72,0.0 +gfx950,256,32768,7168,18432,cktile,26,0,3742.2158,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2313.78,322.23,0.0 +gfx950,256,32768,8192,512,ck,0,0,360.9476,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,761.55,1545.49,0.0 +gfx950,256,32768,8192,1536,ck,0,0,737.4624,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1118.2,813.31,0.0 +gfx950,256,32768,9216,7168,cktile,27,0,1959.5649,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2209.33,461.8,0.0 +gfx950,256,32768,11264,1536,ck,0,0,1005.4332,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1127.74,801.48,0.0 +gfx950,256,32768,12288,1536,ck,0,0,1102.1362,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.32,793.47,0.0 +gfx950,256,32768,14336,1536,ck,0,0,1285.2161,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.85,787.32,0.0 +gfx950,256,32768,16384,512,ck,0,0,696.8117,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,788.96,1577.05,0.0 +gfx950,256,32768,20480,1536,ck,0,0,1826.5515,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.68,779.59,0.0 +gfx950,256,32768,24576,1536,ck,0,0,2191.9056,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.65,774.98,0.0 +gfx950,256,32768,32768,512,ck,0,0,1385.222,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,793.74,1574.5,0.0 +gfx950,256,32768,32768,1536,ck,0,0,2918.4955,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1130.22,770.31,0.0 +gfx950,256,32768,36864,7168,cktile,28,0,8025.7332,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2157.72,363.21,0.0 +gfx950,256,65536,64,7168,ck,18,0,116.7699,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,514.94,4098.74,0.0 +gfx950,256,65536,128,7168,ck,0,0,150.0461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,801.48,3248.71,0.0 +gfx950,256,65536,512,7168,cktile,28,0,277.0424,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1736.33,1951.11,0.0 +gfx950,256,65536,576,7168,cktile,27,0,381.3612,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1419.04,1440.6,0.0 +gfx950,256,65536,1024,7168,cktile,27,0,535.6356,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1796.13,1141.3,0.0 +gfx950,256,65536,1536,7168,cktile,27,0,708.5099,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2036.82,962.72,0.0 +gfx950,256,65536,2112,7168,cktile,11,0,1048.9318,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1891.71,726.19,0.0 +gfx950,256,65536,2240,7168,cktile,11,0,1052.852,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1998.89,740.29,0.0 +gfx950,256,65536,3072,1536,ck,0,0,566.2254,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1092.28,897.23,0.0 +gfx950,256,65536,4096,512,ck,0,0,357.1801,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,769.58,1602.9,0.0 +gfx950,256,65536,4096,7168,cktile,26,0,1875.2217,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2052.18,552.46,0.0 +gfx950,256,65536,4608,7168,cktile,27,0,2093.6777,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2067.81,528.63,0.0 +gfx950,256,65536,6144,1536,ck,0,0,1111.303,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1113.06,823.72,0.0 +gfx950,256,65536,7168,256,ck,17,0,469.6057,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,512.17,2040.3,0.0 +gfx950,256,65536,7168,512,ck,0,0,621.2778,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.27,1572.16,0.0 +gfx950,256,65536,7168,2048,ck,0,0,1669.8174,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1152.31,651.82,0.0 +gfx950,256,65536,7168,2304,ck,0,0,1801.7363,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1201.43,614.43,0.0 +gfx950,256,65536,7168,4096,cktile,27,0,1951.4105,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1972.06,634.06,0.0 +gfx950,256,65536,7168,4608,cktile,11,0,2159.6204,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2004.67,590.17,0.0 +gfx950,256,65536,7168,9216,cktile,11,0,3868.2373,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,2238.4,416.1,0.0 +gfx950,256,65536,7168,16384,cktile,26,0,6897.8894,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2231.58,308.89,0.0 +gfx950,256,65536,7168,18432,cktile,28,0,7476.3659,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2316.27,304.91,0.0 +gfx950,256,65536,8192,512,ck,0,0,704.0693,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,780.83,1578.67,0.0 +gfx950,256,65536,8192,1536,ck,0,0,1473.86,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.01,805.36,0.0 +gfx950,256,65536,9216,7168,cktile,27,0,3989.8626,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2170.16,437.05,0.0 +gfx950,256,65536,11264,1536,ck,0,0,2012.9804,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1126.56,792.04,0.0 +gfx950,256,65536,12288,1536,ck,0,0,2193.6639,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1127.75,788.7,0.0 +gfx950,256,65536,14336,1536,ck,0,0,2557.2449,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1128.64,782.77,0.0 +gfx950,256,65536,16384,512,ck,0,0,1381.4369,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,795.92,1584.89,0.0 +gfx950,256,65536,20480,1536,ck,0,0,3667.0687,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1124.38,768.05,0.0 +gfx950,256,65536,24576,1536,ck,0,0,4395.2438,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1125.72,764.38,0.0 +gfx950,256,98304,7168,4096,cktile,28,0,2962.6601,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1948.4,621.5,0.0 +gfx950,256,98304,7168,4608,cktile,26,0,3189.2174,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2036.23,594.28,0.0 +gfx950,256,98304,7168,9216,cktile,27,0,5713.6008,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2273.17,416.78,0.0 +gfx950,256,98304,7168,16384,cktile,26,0,10375.8827,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2225.33,302.37,0.0 +gfx950,256,98304,7168,18432,cktile,27,0,11252.7048,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2308.42,298.0,0.0 +gfx950,256,98304,8192,512,ck,0,0,1045.7319,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,788.57,1592.32,0.0 +gfx950,256,98304,8192,1536,ck,0,0,2210.3452,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.24,802.68,0.0 +gfx950,256,98304,9216,7168,cktile,26,0,5845.0573,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2222.05,441.85,0.0 +gfx950,256,98304,11264,1536,ck,0,0,3016.2181,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1127.77,790.03,0.0 +gfx950,256,98304,12288,1536,ck,0,0,3304.3299,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.03,782.55,0.0 +gfx950,256,98304,14336,1536,ck,0,0,3862.1384,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1120.97,774.59,0.0 +gfx950,256,98304,16384,512,ck,0,0,2069.8943,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,796.79,1584.6,0.0 +gfx950,256,98304,20480,1536,ck,0,0,5509.1484,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.63,764.0,0.0 +gfx950,256,131072,128,7168,ck,0,0,293.0508,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,820.74,3323.64,0.0 +gfx950,256,131072,512,7168,cktile,11,0,545.9348,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1762.25,1973.52,0.0 +gfx950,256,131072,576,7168,cktile,27,0,712.3688,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1519.34,1536.63,0.0 +gfx950,256,131072,1024,7168,cktile,26,0,982.8131,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1957.79,1236.55,0.0 +gfx950,256,131072,1536,7168,cktile,27,0,1418.7228,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2034.38,953.81,0.0 +gfx950,256,131072,2112,7168,cktile,28,0,2108.0457,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1882.57,715.5,0.0 +gfx950,256,131072,2240,7168,cktile,27,0,2103.3064,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2001.17,733.5,0.0 +gfx950,256,131072,3072,1536,ck,0,0,1129.3765,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1095.25,895.5,0.0 +gfx950,256,131072,4096,512,ck,0,0,712.1968,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,771.92,1604.82,0.0 +gfx950,256,131072,4096,7168,cktile,27,0,3725.4959,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2065.92,548.28,0.0 +gfx950,256,131072,6144,1536,ck,0,0,2213.8353,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1117.47,822.72,0.0 +gfx950,256,131072,7168,256,ck,17,0,935.1344,a8w8_blockscale_1x128x128_256x64x128x256_16x16_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,514.4,2047.23,0.0 +gfx950,256,131072,7168,512,ck,0,0,1234.1151,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,779.56,1579.94,0.0 +gfx950,256,131072,7168,2048,ck,0,0,3347.1523,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1149.72,645.97,0.0 +gfx950,256,131072,7168,4096,cktile,12,0,3923.2429,a8w8_blockscale_cktile_16x128x256_1x4x1_16x16x128_intrawave_0x1x0_3,1961.79,623.28,0.0 +gfx950,256,131072,7168,4608,cktile,26,0,4209.0554,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2057.15,597.77,0.0 +gfx950,256,131072,7168,9216,cktile,28,0,7568.5865,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,2288.05,416.6,0.0 +gfx950,256,131072,7168,16384,cktile,27,0,13692.8484,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2248.35,302.64,0.0 +gfx950,256,131072,7168,18432,cktile,26,0,15254.4732,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2270.46,290.22,0.0 +gfx950,256,131072,8192,512,ck,0,0,1408.0054,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,780.9,1575.84,0.0 +gfx950,256,131072,8192,1536,ck,0,0,2939.9522,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1121.97,803.21,0.0 +gfx950,256,131072,9216,7168,cktile,26,0,7924.4831,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,2185.29,431.76,0.0 +gfx950,256,131072,12288,1536,ck,0,0,4404.9134,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.25,781.27,0.0 +gfx950,256,131072,14336,1536,ck,0,0,5136.8917,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.72,775.07,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_235b.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_235b.csv index 3d44184021..a12064b753 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_235b.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_235b.csv @@ -1,129 +1,129 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,1280,4096,ck,8,0,14.2236,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.74,369.07,0.0 -256,2,1280,4096,ck,8,0,14.2731,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.47,368.26,0.0 -256,4,1280,4096,ck,8,0,14.3084,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.93,368.28,0.0 -256,8,1280,4096,ck,8,0,14.3739,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.84,368.45,0.0 -256,16,1280,4096,ck,8,0,13.8774,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.09,385.47,0.0 -256,32,1280,4096,ck,8,0,14.0399,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.9,388.6,0.0 -256,64,1280,4096,ck,8,0,13.9962,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.95,405.03,0.0 -256,128,1280,4096,ck,8,0,13.8253,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.08,440.85,0.0 -256,256,1280,4096,ck,18,0,17.2314,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,155.78,403.15,0.0 -256,512,1280,4096,ck,18,0,17.5233,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,306.38,493.67,0.0 -256,1024,1280,4096,ck,2,0,26.4698,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,405.65,455.56,0.0 -256,2048,1280,4096,ck,0,0,34.1317,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.18,552.99,0.0 -256,4096,1280,4096,ck,0,0,56.3693,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,761.93,576.66,0.0 -256,8192,1280,4096,cktile,28,0,61.1192,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1405.44,977.91,0.0 -256,16384,1280,4096,cktile,11,0,130.9265,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1312.18,872.97,0.0 -256,32768,1280,4096,cktile,11,0,249.9686,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1374.56,893.5,0.0 -256,1,2304,4096,ck,8,0,14.3098,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.32,660.1,0.0 -256,2,2304,4096,ck,8,0,14.4682,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.61,653.47,0.0 -256,4,2304,4096,ck,8,0,14.4783,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.21,654.22,0.0 -256,8,2304,4096,ck,8,0,14.6171,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.33,650.39,0.0 -256,16,2304,4096,ck,8,0,14.0267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.53,682.73,0.0 -256,32,2304,4096,ck,8,0,14.1174,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.78,688.21,0.0 -256,64,2304,4096,ck,8,0,14.1756,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,85.21,705.03,0.0 -256,128,2304,4096,ck,18,0,17.3633,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,139.14,607.68,0.0 -256,256,2304,4096,ck,18,0,17.5033,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,276.05,666.47,0.0 -256,512,2304,4096,ck,2,0,26.651,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,362.6,521.32,0.0 -256,1024,2304,4096,ck,0,0,33.5447,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,576.17,547.03,0.0 -256,2048,2304,4096,ck,0,0,55.8113,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,692.6,488.48,0.0 -256,4096,2304,4096,cktile,26,0,59.4491,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1300.43,758.44,0.0 -256,8192,2304,4096,cktile,11,0,123.7144,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1249.8,652.64,0.0 -256,16384,2304,4096,cktile,11,0,219.4733,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1409.0,692.77,0.0 -256,32768,2304,4096,cktile,27,0,351.8642,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1757.71,837.4,0.0 -256,1,4096,1024,ck,8,0,5.1545,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.63,815.5,0.0 -256,1,4096,2048,ck,8,0,8.9204,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.88,941.53,0.0 -256,1,4096,4096,ck,8,0,14.2026,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.36,1182.14,0.0 -256,1,4096,8192,ck,8,0,26.1922,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.56,1281.71,0.0 -256,2,4096,1024,ck,8,0,5.2133,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.22,808.07,0.0 -256,2,4096,2048,ck,8,0,8.8069,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.81,954.83,0.0 -256,2,4096,4096,ck,8,0,14.2267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.72,1181.0,0.0 -256,2,4096,8192,ck,8,0,26.2327,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.12,1280.36,0.0 -256,4,4096,1024,ck,8,0,5.2609,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.38,804.27,0.0 -256,4,4096,2048,ck,8,0,8.7247,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.69,966.17,0.0 -256,4,4096,4096,ck,8,0,14.1693,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.47,1187.52,0.0 -256,4,4096,8192,ck,8,0,26.2963,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.21,1278.51,0.0 -256,8,4096,1024,ck,8,0,5.2635,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.75,810.87,0.0 -256,8,4096,2048,ck,8,0,9.0753,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.79,933.36,0.0 -256,8,4096,4096,ck,8,0,14.2173,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.88,1186.97,0.0 -256,8,4096,8192,ck,8,0,26.281,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.43,1281.74,0.0 -256,16,4096,1024,ck,8,0,5.2225,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.7,831.36,0.0 -256,16,4096,2048,ck,8,0,8.0859,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.2,1057.7,0.0 -256,16,4096,4096,ck,8,0,13.8171,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.86,1228.47,0.0 -256,16,4096,8192,ck,8,0,25.5699,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.99,1322.51,0.0 -256,32,4096,1024,ck,8,0,5.4079,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.64,830.12,0.0 -256,32,4096,2048,ck,8,0,8.6346,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.18,1009.46,0.0 -256,32,4096,4096,ck,8,0,14.02,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,76.59,1224.71,0.0 -256,32,4096,8192,ck,8,0,25.5833,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,83.94,1332.07,0.0 -256,64,4096,1024,ck,8,0,5.5034,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.55,869.3,0.0 -256,64,4096,2048,ck,8,0,9.0476,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,118.68,999.6,0.0 -256,64,4096,4096,ck,8,0,14.1143,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,152.15,1244.39,0.0 -256,64,4096,8192,ck,8,0,25.726,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,166.95,1345.06,0.0 -256,128,4096,1024,ck,7,0,6.3161,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,170.0,850.83,0.0 -256,128,4096,2048,ck,7,0,10.2519,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,209.47,946.1,0.0 -256,128,4096,4096,ck,18,0,17.4259,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,246.47,1053.03,0.0 -256,128,4096,8192,ck,18,0,31.0562,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,276.59,1147.97,0.0 -256,256,4096,1024,ck,18,0,6.7906,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.24,965.1,0.0 -256,256,4096,2048,ck,18,0,10.7498,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,399.54,1024.21,0.0 -256,256,4096,4096,ck,18,0,17.9977,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,477.28,1106.97,0.0 -256,256,4096,8192,ck,18,0,33.2755,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,516.29,1134.43,0.0 -256,512,4096,1024,ck,2,0,10.6299,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,404.05,838.47,0.0 -256,512,4096,2048,ck,2,0,16.393,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,524.0,831.54,0.0 -256,512,4096,4096,ck,2,0,28.3403,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,606.2,813.99,0.0 -256,512,4096,8192,ck,2,0,51.5612,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,666.39,813.46,0.0 -256,1024,4096,1024,ck,0,0,14.5662,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,589.72,935.83,0.0 -256,1024,4096,2048,ck,0,0,22.1819,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.5,850.89,0.0 -256,1024,4096,4096,ck,0,0,40.44,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,849.65,726.02,0.0 -256,1024,4096,8192,ck,0,0,70.6309,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,972.94,712.6,0.0 -256,2048,4096,1024,ck,0,0,24.0096,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,715.54,960.81,0.0 -256,2048,4096,2048,ck,0,0,35.5692,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,966.0,825.44,0.0 -256,2048,4096,4096,cktile,11,0,59.7429,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1150.25,702.06,0.0 -256,2048,4096,8192,cktile,11,0,105.2753,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1305.52,637.46,0.0 -256,4096,4096,1024,ck,0,0,45.9216,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,748.23,913.36,0.0 -256,4096,4096,2048,cktile,11,0,76.3082,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,900.55,659.58,0.0 -256,4096,4096,4096,cktile,11,0,115.8759,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1186.09,579.14,0.0 -256,4096,4096,8192,cktile,11,0,199.5552,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1377.45,504.44,0.0 -256,8192,4096,1024,cktile,11,0,80.1954,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,856.9,993.72,0.0 -256,8192,4096,2048,cktile,11,0,116.4096,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1180.65,792.67,0.0 -256,8192,4096,4096,cktile,11,0,185.5059,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1481.77,633.08,0.0 -256,8192,4096,8192,cktile,28,0,291.6666,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1884.88,575.22,0.0 -256,16384,4096,1024,cktile,11,0,154.0943,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,891.91,1007.11,0.0 -256,16384,4096,2048,cktile,11,0,226.409,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1214.08,778.06,0.0 -256,16384,4096,4096,cktile,11,0,356.4876,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1542.15,611.81,0.0 -256,16384,4096,8192,cktile,27,0,577.0085,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1905.54,523.37,0.0 -256,32768,4096,1024,cktile,11,0,285.7822,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,961.84,1071.39,0.0 -256,32768,4096,2048,cktile,11,0,419.1778,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1311.51,820.49,0.0 -256,32768,4096,4096,cktile,28,0,623.6529,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1763.02,672.54,0.0 -256,32768,4096,8192,cktile,27,0,1065.5447,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2063.76,535.34,0.0 -256,1,4608,4096,ck,8,0,14.2605,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.65,1324.48,0.0 -256,2,4608,4096,ck,8,0,14.3267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.27,1319.28,0.0 -256,4,4608,4096,ck,8,0,14.3329,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.53,1320.57,0.0 -256,8,4608,4096,ck,8,0,14.3937,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.98,1318.69,0.0 -256,16,4608,4096,ck,8,0,13.9524,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.29,1368.03,0.0 -256,32,4608,4096,ck,8,0,14.0352,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,86.07,1375.14,0.0 -256,64,4608,4096,ck,18,0,17.0643,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,141.58,1156.0,0.0 -256,128,4608,4096,ck,18,0,17.5292,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,275.65,1173.94,0.0 -256,256,4608,4096,ck,2,0,27.1182,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,356.35,821.67,0.0 -256,512,4608,4096,ck,0,0,34.1289,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,566.3,752.74,0.0 -256,1024,4608,4096,cktile,11,0,55.4868,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,696.65,585.83,0.0 -256,2048,4608,4096,cktile,11,0,57.9299,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1334.53,796.43,0.0 -256,4096,4608,4096,cktile,11,0,122.8584,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1258.51,597.44,0.0 -256,8192,4608,4096,cktile,11,0,210.2934,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1470.51,608.32,0.0 -256,16384,4608,4096,cktile,27,0,339.4407,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1822.04,698.14,0.0 -256,32768,4608,4096,cktile,11,0,751.9367,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1645.02,605.21,0.0 -256,1,9216,4096,ck,8,0,15.1096,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.0,2499.82,0.0 -256,2,9216,4096,ck,8,0,15.1686,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.95,2491.58,0.0 -256,4,9216,4096,ck,8,0,15.1452,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.94,2498.41,0.0 -256,8,9216,4096,ck,8,0,15.1777,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,39.79,2498.99,0.0 -256,16,9216,4096,ck,8,0,14.8504,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,81.34,2566.21,0.0 -256,32,9216,4096,ck,7,0,18.1776,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,132.91,2116.32,0.0 -256,64,9216,4096,ck,18,0,18.1853,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,265.7,2155.07,0.0 -256,128,9216,4096,ck,2,0,28.0005,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,345.13,1451.13,0.0 -256,256,9216,4096,ck,0,0,35.5904,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,543.05,1222.69,0.0 -256,512,9216,4096,cktile,11,0,54.5189,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,709.01,903.96,0.0 -256,1024,9216,4096,cktile,26,0,57.7557,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1338.56,1053.01,0.0 -256,2048,9216,4096,cktile,11,0,120.3443,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1284.8,697.05,0.0 -256,4096,9216,4096,cktile,11,0,203.3908,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1520.41,639.28,0.0 -256,8192,9216,4096,cktile,27,0,340.9893,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1813.77,651.92,0.0 -256,16384,9216,4096,cktile,28,0,658.8252,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1877.51,617.53,0.0 -256,32768,9216,4096,cktile,28,0,1281.386,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1930.64,605.55,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,1280,4096,ck,8,0,14.2236,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.74,369.07,0.0 +gfx950,256,2,1280,4096,ck,8,0,14.2731,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.47,368.26,0.0 +gfx950,256,4,1280,4096,ck,8,0,14.3084,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.93,368.28,0.0 +gfx950,256,8,1280,4096,ck,8,0,14.3739,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.84,368.45,0.0 +gfx950,256,16,1280,4096,ck,8,0,13.8774,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.09,385.47,0.0 +gfx950,256,32,1280,4096,ck,8,0,14.0399,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.9,388.6,0.0 +gfx950,256,64,1280,4096,ck,8,0,13.9962,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.95,405.03,0.0 +gfx950,256,128,1280,4096,ck,8,0,13.8253,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.08,440.85,0.0 +gfx950,256,256,1280,4096,ck,18,0,17.2314,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,155.78,403.15,0.0 +gfx950,256,512,1280,4096,ck,18,0,17.5233,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,306.38,493.67,0.0 +gfx950,256,1024,1280,4096,ck,2,0,26.4698,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,405.65,455.56,0.0 +gfx950,256,2048,1280,4096,ck,0,0,34.1317,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.18,552.99,0.0 +gfx950,256,4096,1280,4096,ck,0,0,56.3693,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,761.93,576.66,0.0 +gfx950,256,8192,1280,4096,cktile,28,0,61.1192,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1405.44,977.91,0.0 +gfx950,256,16384,1280,4096,cktile,11,0,130.9265,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1312.18,872.97,0.0 +gfx950,256,32768,1280,4096,cktile,11,0,249.9686,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1374.56,893.5,0.0 +gfx950,256,1,2304,4096,ck,8,0,14.3098,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.32,660.1,0.0 +gfx950,256,2,2304,4096,ck,8,0,14.4682,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.61,653.47,0.0 +gfx950,256,4,2304,4096,ck,8,0,14.4783,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.21,654.22,0.0 +gfx950,256,8,2304,4096,ck,8,0,14.6171,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.33,650.39,0.0 +gfx950,256,16,2304,4096,ck,8,0,14.0267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.53,682.73,0.0 +gfx950,256,32,2304,4096,ck,8,0,14.1174,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.78,688.21,0.0 +gfx950,256,64,2304,4096,ck,8,0,14.1756,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,85.21,705.03,0.0 +gfx950,256,128,2304,4096,ck,18,0,17.3633,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,139.14,607.68,0.0 +gfx950,256,256,2304,4096,ck,18,0,17.5033,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,276.05,666.47,0.0 +gfx950,256,512,2304,4096,ck,2,0,26.651,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,362.6,521.32,0.0 +gfx950,256,1024,2304,4096,ck,0,0,33.5447,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,576.17,547.03,0.0 +gfx950,256,2048,2304,4096,ck,0,0,55.8113,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,692.6,488.48,0.0 +gfx950,256,4096,2304,4096,cktile,26,0,59.4491,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1300.43,758.44,0.0 +gfx950,256,8192,2304,4096,cktile,11,0,123.7144,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1249.8,652.64,0.0 +gfx950,256,16384,2304,4096,cktile,11,0,219.4733,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1409.0,692.77,0.0 +gfx950,256,32768,2304,4096,cktile,27,0,351.8642,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1757.71,837.4,0.0 +gfx950,256,1,4096,1024,ck,8,0,5.1545,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.63,815.5,0.0 +gfx950,256,1,4096,2048,ck,8,0,8.9204,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.88,941.53,0.0 +gfx950,256,1,4096,4096,ck,8,0,14.2026,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.36,1182.14,0.0 +gfx950,256,1,4096,8192,ck,8,0,26.1922,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.56,1281.71,0.0 +gfx950,256,2,4096,1024,ck,8,0,5.2133,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.22,808.07,0.0 +gfx950,256,2,4096,2048,ck,8,0,8.8069,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.81,954.83,0.0 +gfx950,256,2,4096,4096,ck,8,0,14.2267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.72,1181.0,0.0 +gfx950,256,2,4096,8192,ck,8,0,26.2327,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.12,1280.36,0.0 +gfx950,256,4,4096,1024,ck,8,0,5.2609,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.38,804.27,0.0 +gfx950,256,4,4096,2048,ck,8,0,8.7247,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.69,966.17,0.0 +gfx950,256,4,4096,4096,ck,8,0,14.1693,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.47,1187.52,0.0 +gfx950,256,4,4096,8192,ck,8,0,26.2963,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.21,1278.51,0.0 +gfx950,256,8,4096,1024,ck,8,0,5.2635,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.75,810.87,0.0 +gfx950,256,8,4096,2048,ck,8,0,9.0753,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.79,933.36,0.0 +gfx950,256,8,4096,4096,ck,8,0,14.2173,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.88,1186.97,0.0 +gfx950,256,8,4096,8192,ck,8,0,26.281,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.43,1281.74,0.0 +gfx950,256,16,4096,1024,ck,8,0,5.2225,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.7,831.36,0.0 +gfx950,256,16,4096,2048,ck,8,0,8.0859,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.2,1057.7,0.0 +gfx950,256,16,4096,4096,ck,8,0,13.8171,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.86,1228.47,0.0 +gfx950,256,16,4096,8192,ck,8,0,25.5699,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.99,1322.51,0.0 +gfx950,256,32,4096,1024,ck,8,0,5.4079,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.64,830.12,0.0 +gfx950,256,32,4096,2048,ck,8,0,8.6346,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.18,1009.46,0.0 +gfx950,256,32,4096,4096,ck,8,0,14.02,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,76.59,1224.71,0.0 +gfx950,256,32,4096,8192,ck,8,0,25.5833,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,83.94,1332.07,0.0 +gfx950,256,64,4096,1024,ck,8,0,5.5034,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.55,869.3,0.0 +gfx950,256,64,4096,2048,ck,8,0,9.0476,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,118.68,999.6,0.0 +gfx950,256,64,4096,4096,ck,8,0,14.1143,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,152.15,1244.39,0.0 +gfx950,256,64,4096,8192,ck,8,0,25.726,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,166.95,1345.06,0.0 +gfx950,256,128,4096,1024,ck,7,0,6.3161,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,170.0,850.83,0.0 +gfx950,256,128,4096,2048,ck,7,0,10.2519,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,209.47,946.1,0.0 +gfx950,256,128,4096,4096,ck,18,0,17.4259,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,246.47,1053.03,0.0 +gfx950,256,128,4096,8192,ck,18,0,31.0562,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,276.59,1147.97,0.0 +gfx950,256,256,4096,1024,ck,18,0,6.7906,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.24,965.1,0.0 +gfx950,256,256,4096,2048,ck,18,0,10.7498,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,399.54,1024.21,0.0 +gfx950,256,256,4096,4096,ck,18,0,17.9977,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,477.28,1106.97,0.0 +gfx950,256,256,4096,8192,ck,18,0,33.2755,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,516.29,1134.43,0.0 +gfx950,256,512,4096,1024,ck,2,0,10.6299,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,404.05,838.47,0.0 +gfx950,256,512,4096,2048,ck,2,0,16.393,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,524.0,831.54,0.0 +gfx950,256,512,4096,4096,ck,2,0,28.3403,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,606.2,813.99,0.0 +gfx950,256,512,4096,8192,ck,2,0,51.5612,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,666.39,813.46,0.0 +gfx950,256,1024,4096,1024,ck,0,0,14.5662,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,589.72,935.83,0.0 +gfx950,256,1024,4096,2048,ck,0,0,22.1819,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.5,850.89,0.0 +gfx950,256,1024,4096,4096,ck,0,0,40.44,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,849.65,726.02,0.0 +gfx950,256,1024,4096,8192,ck,0,0,70.6309,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,972.94,712.6,0.0 +gfx950,256,2048,4096,1024,ck,0,0,24.0096,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,715.54,960.81,0.0 +gfx950,256,2048,4096,2048,ck,0,0,35.5692,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,966.0,825.44,0.0 +gfx950,256,2048,4096,4096,cktile,11,0,59.7429,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1150.25,702.06,0.0 +gfx950,256,2048,4096,8192,cktile,11,0,105.2753,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1305.52,637.46,0.0 +gfx950,256,4096,4096,1024,ck,0,0,45.9216,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,748.23,913.36,0.0 +gfx950,256,4096,4096,2048,cktile,11,0,76.3082,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,900.55,659.58,0.0 +gfx950,256,4096,4096,4096,cktile,11,0,115.8759,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1186.09,579.14,0.0 +gfx950,256,4096,4096,8192,cktile,11,0,199.5552,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1377.45,504.44,0.0 +gfx950,256,8192,4096,1024,cktile,11,0,80.1954,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,856.9,993.72,0.0 +gfx950,256,8192,4096,2048,cktile,11,0,116.4096,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1180.65,792.67,0.0 +gfx950,256,8192,4096,4096,cktile,11,0,185.5059,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1481.77,633.08,0.0 +gfx950,256,8192,4096,8192,cktile,28,0,291.6666,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1884.88,575.22,0.0 +gfx950,256,16384,4096,1024,cktile,11,0,154.0943,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,891.91,1007.11,0.0 +gfx950,256,16384,4096,2048,cktile,11,0,226.409,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1214.08,778.06,0.0 +gfx950,256,16384,4096,4096,cktile,11,0,356.4876,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1542.15,611.81,0.0 +gfx950,256,16384,4096,8192,cktile,27,0,577.0085,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1905.54,523.37,0.0 +gfx950,256,32768,4096,1024,cktile,11,0,285.7822,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,961.84,1071.39,0.0 +gfx950,256,32768,4096,2048,cktile,11,0,419.1778,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1311.51,820.49,0.0 +gfx950,256,32768,4096,4096,cktile,28,0,623.6529,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1763.02,672.54,0.0 +gfx950,256,32768,4096,8192,cktile,27,0,1065.5447,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,2063.76,535.34,0.0 +gfx950,256,1,4608,4096,ck,8,0,14.2605,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.65,1324.48,0.0 +gfx950,256,2,4608,4096,ck,8,0,14.3267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.27,1319.28,0.0 +gfx950,256,4,4608,4096,ck,8,0,14.3329,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.53,1320.57,0.0 +gfx950,256,8,4608,4096,ck,8,0,14.3937,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.98,1318.69,0.0 +gfx950,256,16,4608,4096,ck,8,0,13.9524,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.29,1368.03,0.0 +gfx950,256,32,4608,4096,ck,8,0,14.0352,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,86.07,1375.14,0.0 +gfx950,256,64,4608,4096,ck,18,0,17.0643,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,141.58,1156.0,0.0 +gfx950,256,128,4608,4096,ck,18,0,17.5292,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,275.65,1173.94,0.0 +gfx950,256,256,4608,4096,ck,2,0,27.1182,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,356.35,821.67,0.0 +gfx950,256,512,4608,4096,ck,0,0,34.1289,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,566.3,752.74,0.0 +gfx950,256,1024,4608,4096,cktile,11,0,55.4868,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,696.65,585.83,0.0 +gfx950,256,2048,4608,4096,cktile,11,0,57.9299,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1334.53,796.43,0.0 +gfx950,256,4096,4608,4096,cktile,11,0,122.8584,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1258.51,597.44,0.0 +gfx950,256,8192,4608,4096,cktile,11,0,210.2934,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1470.51,608.32,0.0 +gfx950,256,16384,4608,4096,cktile,27,0,339.4407,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1822.04,698.14,0.0 +gfx950,256,32768,4608,4096,cktile,11,0,751.9367,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1645.02,605.21,0.0 +gfx950,256,1,9216,4096,ck,8,0,15.1096,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.0,2499.82,0.0 +gfx950,256,2,9216,4096,ck,8,0,15.1686,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.95,2491.58,0.0 +gfx950,256,4,9216,4096,ck,8,0,15.1452,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.94,2498.41,0.0 +gfx950,256,8,9216,4096,ck,8,0,15.1777,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,39.79,2498.99,0.0 +gfx950,256,16,9216,4096,ck,8,0,14.8504,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,81.34,2566.21,0.0 +gfx950,256,32,9216,4096,ck,7,0,18.1776,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,132.91,2116.32,0.0 +gfx950,256,64,9216,4096,ck,18,0,18.1853,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,265.7,2155.07,0.0 +gfx950,256,128,9216,4096,ck,2,0,28.0005,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,345.13,1451.13,0.0 +gfx950,256,256,9216,4096,ck,0,0,35.5904,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,543.05,1222.69,0.0 +gfx950,256,512,9216,4096,cktile,11,0,54.5189,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,709.01,903.96,0.0 +gfx950,256,1024,9216,4096,cktile,26,0,57.7557,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1338.56,1053.01,0.0 +gfx950,256,2048,9216,4096,cktile,11,0,120.3443,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1284.8,697.05,0.0 +gfx950,256,4096,9216,4096,cktile,11,0,203.3908,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1520.41,639.28,0.0 +gfx950,256,8192,9216,4096,cktile,27,0,340.9893,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1813.77,651.92,0.0 +gfx950,256,16384,9216,4096,cktile,28,0,658.8252,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1877.51,617.53,0.0 +gfx950,256,32768,9216,4096,cktile,28,0,1281.386,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1930.64,605.55,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_5_397b_a13b.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_5_397b_a13b.csv index 73a8d323a2..163cd4caee 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_5_397b_a13b.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_5_397b_a13b.csv @@ -1,101 +1,101 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,16,256,4096,ck,8,0,11.6965,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.18,90.04,0.0 -256,32,256,4096,ck,8,0,12.2103,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.12,94.93,0.0 -256,48,256,4096,ck,8,0,12.208,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.87,100.99,0.0 -256,64,256,4096,ck,8,0,12.2484,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.59,106.68,0.0 -256,768,256,4096,ck,8,0,12.895,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,124.9,355.76,0.0 -256,832,256,4096,ck,8,0,13.3192,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,128.64,361.38,0.0 -256,864,256,4096,ck,8,0,13.3665,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,130.85,365.96,0.0 -256,896,256,4096,ck,8,0,13.5159,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,134.21,372.49,0.0 -256,928,256,4096,ck,8,0,13.396,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.74,387.52,0.0 -256,960,256,4096,ck,8,0,13.584,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,143.58,392.67,0.0 -256,992,256,4096,ck,8,0,13.3313,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,151.33,411.17,0.0 -256,1024,256,4096,ck,8,0,13.7184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,151.95,410.32,0.0 -256,1216,256,4096,ck,7,0,16.3449,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,153.71,401.9,0.0 -256,1280,256,4096,ck,18,0,15.9773,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,168.01,434.79,0.0 -256,1536,256,4096,ck,7,0,16.5728,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,191.96,485.07,0.0 -256,2048,256,4096,ck,7,0,16.9213,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,251.47,614.5,0.0 -256,4096,256,4096,ck,18,0,20.9363,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,408.19,946.98,0.0 -256,8192,256,4096,ck,2,0,27.4086,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,619.92,1400.38,0.0 -256,16384,256,4096,ck,0,0,38.0411,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,901.02,2007.35,0.0 -256,16,1024,4096,ck,8,0,13.1276,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.64,319.97,0.0 -256,32,1024,4096,ck,8,0,13.4521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.97,322.76,0.0 -256,48,1024,4096,ck,8,0,13.2544,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.32,334.99,0.0 -256,64,1024,4096,ck,8,0,13.3221,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,35.26,340.66,0.0 -256,768,1024,4096,ck,18,0,16.4325,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,392.06,542.39,0.0 -256,832,1024,4096,ck,18,0,20.6277,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,332.25,446.68,0.0 -256,864,1024,4096,ck,18,0,20.7111,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,337.79,449.92,0.0 -256,896,1024,4096,ck,18,0,20.7092,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,350.38,459.16,0.0 -256,928,1024,4096,ck,18,0,20.7299,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,363.79,468.78,0.0 -256,960,1024,4096,ck,18,0,20.7661,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,375.68,477.13,0.0 -256,992,1024,4096,ck,18,0,20.9593,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,385.02,482.12,0.0 -256,1024,1024,4096,ck,18,0,20.8715,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,399.51,493.56,0.0 -256,1216,1024,4096,ck,2,0,25.8517,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,388.74,446.97,0.0 -256,1280,1024,4096,ck,3,0,24.8162,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,432.68,485.92,0.0 -256,1536,1024,4096,ck,2,0,26.1639,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,486.38,516.54,0.0 -256,2048,1024,4096,ck,2,0,26.2741,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,647.8,634.1,0.0 -256,4096,1024,4096,ck,0,0,36.2164,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,943.87,807.12,0.0 -256,8192,1024,4096,ck,0,0,61.7018,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1101.5,874.74,0.0 -256,16384,1024,4096,ck,0,0,115.4381,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1187.68,906.22,0.0 -256,16,1536,4096,ck,8,0,13.5333,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.93,465.42,0.0 -256,32,1536,4096,ck,8,0,13.9034,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.72,464.89,0.0 -256,48,1536,4096,ck,8,0,13.8133,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.44,476.22,0.0 -256,64,1536,4096,ck,8,0,13.7676,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,51.18,486.13,0.0 -256,768,1536,4096,ck,3,0,24.8201,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,389.35,475.28,0.0 -256,832,1536,4096,ck,2,0,25.7589,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,399.09,471.59,0.0 -256,864,1536,4096,ck,2,0,25.6811,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,408.63,477.77,0.0 -256,896,1536,4096,ck,2,0,25.8585,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,420.91,483.08,0.0 -256,928,1536,4096,ck,2,0,26.0381,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,434.44,489.11,0.0 -256,960,1536,4096,ck,2,0,25.9117,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,451.61,500.07,0.0 -256,992,1536,4096,ck,2,0,25.6287,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,472.31,514.54,0.0 -256,1024,1536,4096,ck,2,0,25.9951,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,481.15,516.11,0.0 -256,1216,1536,4096,ck,2,0,26.4181,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,570.61,563.2,0.0 -256,1280,1536,4096,ck,2,0,26.0995,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,617.1,592.6,0.0 -256,1536,1536,4096,ck,2,0,32.599,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,585.55,526.56,0.0 -256,2048,1536,4096,ck,0,0,33.4431,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,763.41,623.01,0.0 -256,4096,1536,4096,ck,0,0,53.8425,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,952.32,659.35,0.0 -256,8192,1536,4096,ck,0,0,92.8961,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1097.43,692.89,0.0 -256,16384,1536,4096,cktile,11,0,153.7171,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1337.88,803.07,0.0 -256,16,2560,4096,ck,8,0,13.4887,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.55,778.06,0.0 -256,32,2560,4096,ck,8,0,13.7977,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.48,775.99,0.0 -256,48,2560,4096,ck,8,0,13.8918,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,60.39,781.35,0.0 -256,64,2560,4096,ck,8,0,13.6447,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,86.07,806.31,0.0 -256,768,2560,4096,ck,2,0,25.9198,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,621.38,677.62,0.0 -256,832,2560,4096,ck,2,0,32.0203,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,535.09,562.62,0.0 -256,864,2560,4096,ck,2,0,32.4532,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.94,559.94,0.0 -256,896,2560,4096,ck,2,0,32.484,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,558.44,568.21,0.0 -256,928,2560,4096,ck,0,0,32.3089,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,583.54,580.98,0.0 -256,960,2560,4096,ck,0,0,32.3746,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,602.43,588.63,0.0 -256,992,2560,4096,ck,0,0,32.9496,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,612.29,587.31,0.0 -256,1024,2560,4096,ck,0,0,33.0949,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.88,593.64,0.0 -256,1216,2560,4096,ck,0,0,33.5298,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,749.3,642.01,0.0 -256,1280,2560,4096,ck,0,0,32.9542,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,814.57,676.16,0.0 -256,1536,2560,4096,ck,0,0,34.5836,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,919.91,707.46,0.0 -256,2048,2560,4096,ck,2,0,50.9098,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,835.82,573.27,0.0 -256,4096,2560,4096,cktile,11,0,61.9183,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1380.19,775.88,0.0 -256,8192,2560,4096,cktile,11,0,117.5766,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1445.11,724.24,0.0 -256,16384,2560,4096,cktile,11,0,226.3441,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1514.32,711.8,0.0 -256,16,4096,128,ck,8,0,2.8465,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.37,187.11,0.0 -256,32,4096,128,ck,11,0,3.0103,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_2x1_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,8.36,240.5,0.0 -256,48,4096,128,cktile,7,0,3.0027,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,13.97,285.44,0.0 -256,64,4096,128,ck,8,0,3.1557,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.61,313.78,0.0 -256,768,4096,128,cktile,6,0,5.9999,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,134.22,1152.36,0.0 -256,832,4096,128,cktile,7,0,6.809,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,125.82,1075.3,0.0 -256,864,4096,128,cktile,10,0,6.0445,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,144.68,1234.7,0.0 -256,896,4096,128,cktile,7,0,6.6084,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,137.25,1168.37,0.0 -256,928,4096,128,cktile,7,0,6.2697,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,150.35,1276.61,0.0 -256,960,4096,128,cktile,4,0,6.3954,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x128_intrawave_0x1x0_1,152.48,1291.85,0.0 -256,992,4096,128,cktile,10,0,6.2668,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,160.96,1360.84,0.0 -256,1024,4096,128,cktile,4,0,6.5738,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x128_intrawave_0x1x0_1,158.55,1337.79,0.0 -256,1216,4096,128,cktile,7,0,7.388,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,170.03,1420.09,0.0 -256,1280,4096,128,cktile,7,0,7.3956,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,181.48,1510.88,0.0 -256,1536,4096,128,cktile,10,0,7.9103,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,201.09,1661.85,0.0 -256,2048,4096,128,cktile,10,0,8.5809,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,247.94,2028.41,0.0 -256,4096,4096,128,cktile,10,0,12.0446,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,354.76,2858.4,0.0 -256,8192,4096,128,cktile,10,0,21.4001,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,396.99,3174.42,0.0 -256,16384,4096,128,cktile,10,0,39.0112,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,439.31,3499.16,0.0 -256,256,256,4096,ck,8,0,11.6735,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.99,190.88,0.0 -256,256,1024,4096,ck,8,0,13.1167,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,163.72,439.68,0.0 -256,256,1536,4096,ck,18,0,16.0602,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,200.57,506.0,0.0 -256,256,2560,4096,ck,18,0,16.2721,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,329.93,789.39,0.0 -256,256,4096,128,cktile,7,0,3.9178,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,68.52,677.47,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,16,256,4096,ck,8,0,11.6965,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.18,90.04,0.0 +gfx950,256,32,256,4096,ck,8,0,12.2103,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.12,94.93,0.0 +gfx950,256,48,256,4096,ck,8,0,12.208,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.87,100.99,0.0 +gfx950,256,64,256,4096,ck,8,0,12.2484,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.59,106.68,0.0 +gfx950,256,768,256,4096,ck,8,0,12.895,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,124.9,355.76,0.0 +gfx950,256,832,256,4096,ck,8,0,13.3192,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,128.64,361.38,0.0 +gfx950,256,864,256,4096,ck,8,0,13.3665,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,130.85,365.96,0.0 +gfx950,256,896,256,4096,ck,8,0,13.5159,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,134.21,372.49,0.0 +gfx950,256,928,256,4096,ck,8,0,13.396,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.74,387.52,0.0 +gfx950,256,960,256,4096,ck,8,0,13.584,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,143.58,392.67,0.0 +gfx950,256,992,256,4096,ck,8,0,13.3313,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,151.33,411.17,0.0 +gfx950,256,1024,256,4096,ck,8,0,13.7184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,151.95,410.32,0.0 +gfx950,256,1216,256,4096,ck,7,0,16.3449,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,153.71,401.9,0.0 +gfx950,256,1280,256,4096,ck,18,0,15.9773,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,168.01,434.79,0.0 +gfx950,256,1536,256,4096,ck,7,0,16.5728,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,191.96,485.07,0.0 +gfx950,256,2048,256,4096,ck,7,0,16.9213,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,251.47,614.5,0.0 +gfx950,256,4096,256,4096,ck,18,0,20.9363,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,408.19,946.98,0.0 +gfx950,256,8192,256,4096,ck,2,0,27.4086,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,619.92,1400.38,0.0 +gfx950,256,16384,256,4096,ck,0,0,38.0411,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,901.02,2007.35,0.0 +gfx950,256,16,1024,4096,ck,8,0,13.1276,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.64,319.97,0.0 +gfx950,256,32,1024,4096,ck,8,0,13.4521,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.97,322.76,0.0 +gfx950,256,48,1024,4096,ck,8,0,13.2544,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.32,334.99,0.0 +gfx950,256,64,1024,4096,ck,8,0,13.3221,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,35.26,340.66,0.0 +gfx950,256,768,1024,4096,ck,18,0,16.4325,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,392.06,542.39,0.0 +gfx950,256,832,1024,4096,ck,18,0,20.6277,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,332.25,446.68,0.0 +gfx950,256,864,1024,4096,ck,18,0,20.7111,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,337.79,449.92,0.0 +gfx950,256,896,1024,4096,ck,18,0,20.7092,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,350.38,459.16,0.0 +gfx950,256,928,1024,4096,ck,18,0,20.7299,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,363.79,468.78,0.0 +gfx950,256,960,1024,4096,ck,18,0,20.7661,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,375.68,477.13,0.0 +gfx950,256,992,1024,4096,ck,18,0,20.9593,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,385.02,482.12,0.0 +gfx950,256,1024,1024,4096,ck,18,0,20.8715,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,399.51,493.56,0.0 +gfx950,256,1216,1024,4096,ck,2,0,25.8517,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,388.74,446.97,0.0 +gfx950,256,1280,1024,4096,ck,3,0,24.8162,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,432.68,485.92,0.0 +gfx950,256,1536,1024,4096,ck,2,0,26.1639,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,486.38,516.54,0.0 +gfx950,256,2048,1024,4096,ck,2,0,26.2741,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,647.8,634.1,0.0 +gfx950,256,4096,1024,4096,ck,0,0,36.2164,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,943.87,807.12,0.0 +gfx950,256,8192,1024,4096,ck,0,0,61.7018,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1101.5,874.74,0.0 +gfx950,256,16384,1024,4096,ck,0,0,115.4381,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1187.68,906.22,0.0 +gfx950,256,16,1536,4096,ck,8,0,13.5333,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.93,465.42,0.0 +gfx950,256,32,1536,4096,ck,8,0,13.9034,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,21.72,464.89,0.0 +gfx950,256,48,1536,4096,ck,8,0,13.8133,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.44,476.22,0.0 +gfx950,256,64,1536,4096,ck,8,0,13.7676,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,51.18,486.13,0.0 +gfx950,256,768,1536,4096,ck,3,0,24.8201,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,389.35,475.28,0.0 +gfx950,256,832,1536,4096,ck,2,0,25.7589,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,399.09,471.59,0.0 +gfx950,256,864,1536,4096,ck,2,0,25.6811,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,408.63,477.77,0.0 +gfx950,256,896,1536,4096,ck,2,0,25.8585,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,420.91,483.08,0.0 +gfx950,256,928,1536,4096,ck,2,0,26.0381,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,434.44,489.11,0.0 +gfx950,256,960,1536,4096,ck,2,0,25.9117,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,451.61,500.07,0.0 +gfx950,256,992,1536,4096,ck,2,0,25.6287,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,472.31,514.54,0.0 +gfx950,256,1024,1536,4096,ck,2,0,25.9951,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,481.15,516.11,0.0 +gfx950,256,1216,1536,4096,ck,2,0,26.4181,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,570.61,563.2,0.0 +gfx950,256,1280,1536,4096,ck,2,0,26.0995,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,617.1,592.6,0.0 +gfx950,256,1536,1536,4096,ck,2,0,32.599,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,585.55,526.56,0.0 +gfx950,256,2048,1536,4096,ck,0,0,33.4431,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,763.41,623.01,0.0 +gfx950,256,4096,1536,4096,ck,0,0,53.8425,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,952.32,659.35,0.0 +gfx950,256,8192,1536,4096,ck,0,0,92.8961,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1097.43,692.89,0.0 +gfx950,256,16384,1536,4096,cktile,11,0,153.7171,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1337.88,803.07,0.0 +gfx950,256,16,2560,4096,ck,8,0,13.4887,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.55,778.06,0.0 +gfx950,256,32,2560,4096,ck,8,0,13.7977,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.48,775.99,0.0 +gfx950,256,48,2560,4096,ck,8,0,13.8918,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,60.39,781.35,0.0 +gfx950,256,64,2560,4096,ck,8,0,13.6447,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,86.07,806.31,0.0 +gfx950,256,768,2560,4096,ck,2,0,25.9198,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,621.38,677.62,0.0 +gfx950,256,832,2560,4096,ck,2,0,32.0203,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,535.09,562.62,0.0 +gfx950,256,864,2560,4096,ck,2,0,32.4532,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.94,559.94,0.0 +gfx950,256,896,2560,4096,ck,2,0,32.484,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,558.44,568.21,0.0 +gfx950,256,928,2560,4096,ck,0,0,32.3089,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,583.54,580.98,0.0 +gfx950,256,960,2560,4096,ck,0,0,32.3746,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,602.43,588.63,0.0 +gfx950,256,992,2560,4096,ck,0,0,32.9496,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,612.29,587.31,0.0 +gfx950,256,1024,2560,4096,ck,0,0,33.0949,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.88,593.64,0.0 +gfx950,256,1216,2560,4096,ck,0,0,33.5298,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,749.3,642.01,0.0 +gfx950,256,1280,2560,4096,ck,0,0,32.9542,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,814.57,676.16,0.0 +gfx950,256,1536,2560,4096,ck,0,0,34.5836,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,919.91,707.46,0.0 +gfx950,256,2048,2560,4096,ck,2,0,50.9098,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,835.82,573.27,0.0 +gfx950,256,4096,2560,4096,cktile,11,0,61.9183,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1380.19,775.88,0.0 +gfx950,256,8192,2560,4096,cktile,11,0,117.5766,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1445.11,724.24,0.0 +gfx950,256,16384,2560,4096,cktile,11,0,226.3441,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1514.32,711.8,0.0 +gfx950,256,16,4096,128,ck,8,0,2.8465,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.37,187.11,0.0 +gfx950,256,32,4096,128,ck,11,0,3.0103,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_2x1_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,8.36,240.5,0.0 +gfx950,256,48,4096,128,cktile,7,0,3.0027,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,13.97,285.44,0.0 +gfx950,256,64,4096,128,ck,8,0,3.1557,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.61,313.78,0.0 +gfx950,256,768,4096,128,cktile,6,0,5.9999,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,134.22,1152.36,0.0 +gfx950,256,832,4096,128,cktile,7,0,6.809,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,125.82,1075.3,0.0 +gfx950,256,864,4096,128,cktile,10,0,6.0445,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,144.68,1234.7,0.0 +gfx950,256,896,4096,128,cktile,7,0,6.6084,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,137.25,1168.37,0.0 +gfx950,256,928,4096,128,cktile,7,0,6.2697,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,150.35,1276.61,0.0 +gfx950,256,960,4096,128,cktile,4,0,6.3954,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x128_intrawave_0x1x0_1,152.48,1291.85,0.0 +gfx950,256,992,4096,128,cktile,10,0,6.2668,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,160.96,1360.84,0.0 +gfx950,256,1024,4096,128,cktile,4,0,6.5738,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x128_intrawave_0x1x0_1,158.55,1337.79,0.0 +gfx950,256,1216,4096,128,cktile,7,0,7.388,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,170.03,1420.09,0.0 +gfx950,256,1280,4096,128,cktile,7,0,7.3956,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,181.48,1510.88,0.0 +gfx950,256,1536,4096,128,cktile,10,0,7.9103,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,201.09,1661.85,0.0 +gfx950,256,2048,4096,128,cktile,10,0,8.5809,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,247.94,2028.41,0.0 +gfx950,256,4096,4096,128,cktile,10,0,12.0446,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,354.76,2858.4,0.0 +gfx950,256,8192,4096,128,cktile,10,0,21.4001,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,396.99,3174.42,0.0 +gfx950,256,16384,4096,128,cktile,10,0,39.0112,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,439.31,3499.16,0.0 +gfx950,256,256,256,4096,ck,8,0,11.6735,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.99,190.88,0.0 +gfx950,256,256,1024,4096,ck,8,0,13.1167,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,163.72,439.68,0.0 +gfx950,256,256,1536,4096,ck,18,0,16.0602,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,200.57,506.0,0.0 +gfx950,256,256,2560,4096,ck,18,0,16.2721,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,329.93,789.39,0.0 +gfx950,256,256,4096,128,cktile,7,0,3.9178,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,68.52,677.47,0.0 diff --git a/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv b/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv index ee27f2cd82..c85a88a15c 100644 --- a/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv +++ b/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv @@ -1,1490 +1,1490 @@ -cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -80,1,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.109,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.09,46.1,0.0 -80,2,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.1816,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.18,46.49,0.0 -80,4,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.1737,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.36,47.96,0.0 -80,8,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.4092,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.71,49.68,0.0 -80,16,64,7168,torch.float8_e4m3fnuz,ck,24,0,8.826,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,1.66,65.2,0.0 -80,32,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4134,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,3.12,73.54,0.0 -80,64,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.3656,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,6.27,98.84,0.0 -80,96,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.3563,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,9.41,123.89,0.0 -80,128,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4481,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,12.43,147.4,0.0 -80,160,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4711,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,15.5,171.69,0.0 -80,192,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4734,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,18.6,196.3,0.0 -80,224,64,7168,torch.float8_e4m3fnuz,ck,10,0,9.6924,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,21.2,215.95,0.0 -80,256,64,7168,torch.float8_e4m3fnuz,ck,10,0,9.9464,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,23.61,233.91,0.0 -80,288,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.0283,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,26.35,255.28,0.0 -80,320,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.6767,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,27.5,261.64,0.0 -80,352,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.7355,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,30.08,281.96,0.0 -80,384,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.7324,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,32.83,303.79,0.0 -80,416,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.8957,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,35.03,320.67,0.0 -80,448,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.9127,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,37.67,341.56,0.0 -80,480,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.9898,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,40.07,360.41,0.0 -80,512,64,7168,torch.float8_e4m3fnuz,ck,24,0,11.1383,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,42.18,376.57,0.0 -80,1024,64,7168,torch.float8_e4m3fnuz,ck,5,0,12.4536,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,75.44,636.75,0.0 -80,2048,64,7168,torch.float8_e4m3fnuz,ck,12,0,18.011,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,104.33,855.09,0.0 -80,4096,64,7168,torch.float8_e4m3fnuz,ck,114,0,24.9271,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,150.76,1217.28,0.0 -80,6144,64,7168,torch.float8_e4m3fnuz,ck,115,0,32.8306,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,171.7,1379.36,0.0 -80,8192,64,7168,torch.float8_e4m3fnuz,ck,114,0,41.1456,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,182.67,1463.77,0.0 -80,10240,64,7168,torch.float8_e4m3fnuz,ck,144,0,42.9825,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.58,1748.85,0.0 -80,12288,64,7168,torch.float8_e4m3fnuz,ck,114,0,61.5548,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,183.16,1463.93,0.0 -80,14336,64,7168,torch.float8_e4m3fnuz,ck,114,0,62.6178,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,210.06,1677.71,0.0 -80,16384,64,7168,torch.float8_e4m3fnuz,ck,114,0,78.1928,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,192.25,1534.62,0.0 -80,32768,64,7168,torch.float8_e4m3fnuz,ck,114,0,137.0183,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.42,1748.19,0.0 -80,65536,64,7168,torch.float8_e4m3fnuz,ck,114,0,263.1602,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,228.49,1818.7,0.0 -80,98304,64,7168,torch.float8_e4m3fnuz,ck,114,0,397.9789,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,226.63,1803.32,0.0 -80,1,128,7168,torch.float8_e4m3fnuz,ck,25,0,10.735,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,2,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.1434,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.0606,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.4198,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,128,7168,torch.float8_e4m3fnuz,ck,19,0,10.0946,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,32,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.0298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,64,128,7168,torch.float8_e4m3fnuz,ck,24,0,9.9198,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,96,128,7168,torch.float8_e4m3fnuz,ck,25,0,10.3566,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,128,128,7168,torch.float8_e4m3fnuz,ck,10,0,9.909,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,160,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.2346,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,192,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.2034,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,224,128,7168,torch.float8_e4m3fnuz,ck,24,0,10.2858,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,256,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.8778,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,288,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.7298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,320,128,7168,torch.float8_e4m3fnuz,ck,24,0,10.9094,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,352,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.2254,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,384,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.3822,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,416,128,7168,torch.float8_e4m3fnuz,ck,24,0,11.6798,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,448,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.523,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,480,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.607,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,512,128,7168,torch.float8_e4m3fnuz,ck,11,0,12.1131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,1024,128,7168,torch.float8_e4m3fnuz,ck,6,0,17.8979,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,2048,128,7168,torch.float8_e4m3fnuz,ck,119,0,24.0187,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,4096,128,7168,torch.float8_e4m3fnuz,ck,121,0,38.9415,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,6144,128,7168,torch.float8_e4m3fnuz,ck,123,0,49.1848,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,8192,128,7168,torch.float8_e4m3fnuz,ck,124,0,61.8924,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,10240,128,7168,torch.float8_e4m3fnuz,ck,121,0,63.2764,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,12288,128,7168,torch.float8_e4m3fnuz,ck,141,0,84.5069,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,14336,128,7168,torch.float8_e4m3fnuz,ck,86,0,91.5501,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,128,7168,torch.float8_e4m3fnuz,ck,0,0,114.6838,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,32768,128,7168,torch.float8_e4m3fnuz,ck,121,0,208.4886,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,65536,128,7168,torch.float8_e4m3fnuz,ck,121,0,383.2232,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,98304,128,7168,torch.float8_e4m3fnuz,ck,87,0,561.5334,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 -80,131072,128,7168,torch.float8_e4m3fnuz,ck,0,0,719.3472,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.212,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.36,180.44,0.0 -80,2,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.52,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.7,175.89,0.0 -80,4,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.571,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.39,176.49,0.0 -80,8,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.72,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.74,176.91,0.0 -80,16,256,7168,torch.float8_e4m3fnuz,ck,24,0,9.298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,6.32,210.57,0.0 -80,32,256,7168,torch.float8_e4m3fnuz,ck,24,0,9.6449,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,12.18,215.74,0.0 -80,64,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.023,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,23.43,232.12,0.0 -80,96,256,7168,torch.float8_e4m3fnuz,ck,24,0,10.2032,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,34.53,252.11,0.0 -80,128,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.6508,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,44.11,264.59,0.0 -80,160,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.9094,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,53.83,280.84,0.0 -80,192,256,7168,torch.float8_e4m3fnuz,ck,11,0,11.7012,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,60.22,282.84,0.0 -80,224,256,7168,torch.float8_e4m3fnuz,ck,10,0,11.9205,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,68.96,298.25,0.0 -80,256,256,7168,torch.float8_e4m3fnuz,ck,5,0,11.9933,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,78.34,316.93,0.0 -80,288,256,7168,torch.float8_e4m3fnuz,ck,25,0,11.9972,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,88.1,337.32,0.0 -80,320,256,7168,torch.float8_e4m3fnuz,ck,11,0,12.2086,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,96.19,351.61,0.0 -80,352,256,7168,torch.float8_e4m3fnuz,ck,10,0,16.2524,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,79.49,279.24,0.0 -80,384,256,7168,torch.float8_e4m3fnuz,ck,24,0,16.3243,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,86.33,293.07,0.0 -80,416,256,7168,torch.float8_e4m3fnuz,ck,10,0,16.4786,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,92.65,305.24,0.0 -80,448,256,7168,torch.float8_e4m3fnuz,ck,24,0,16.7118,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,98.38,315.68,0.0 -80,480,256,7168,torch.float8_e4m3fnuz,ck,19,0,17.129,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,102.84,322.34,0.0 -80,512,256,7168,torch.float8_e4m3fnuz,ck,6,0,17.9831,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,104.49,320.7,0.0 -80,1024,256,7168,torch.float8_e4m3fnuz,ck,119,0,23.9594,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,156.85,404.82,0.0 -80,2048,256,7168,torch.float8_e4m3fnuz,ck,114,0,37.6676,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,199.54,466.28,0.0 -80,4096,256,7168,torch.float8_e4m3fnuz,ck,121,0,63.1504,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,238.04,527.19,0.0 -80,6144,256,7168,torch.float8_e4m3fnuz,ck,136,0,77.3912,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,291.36,633.42,0.0 -80,8192,256,7168,torch.float8_e4m3fnuz,ck,138,0,106.0081,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,283.61,610.8,0.0 -80,10240,256,7168,torch.float8_e4m3fnuz,ck,0,0,113.4228,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,331.34,709.54,0.0 -80,12288,256,7168,torch.float8_e4m3fnuz,ck,136,0,152.8106,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,295.12,629.58,0.0 -80,14336,256,7168,torch.float8_e4m3fnuz,ck,102,0,161.1134,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.56,694.76,0.0 -80,16384,256,7168,torch.float8_e4m3fnuz,ck,85,0,207.4081,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,289.91,615.52,0.0 -80,32768,256,7168,torch.float8_e4m3fnuz,ck,0,0,379.1653,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,317.17,668.56,0.0 -80,65536,256,7168,torch.float8_e4m3fnuz,ck,102,0,699.9897,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,343.6,721.66,0.0 -80,98304,256,7168,torch.float8_e4m3fnuz,ck,102,0,973.4779,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,370.61,777.43,0.0 -80,131072,256,7168,torch.float8_e4m3fnuz,ck,102,0,1385.804,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.12,727.71,0.0 -80,1,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.7317,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.63,313.53,0.0 -80,2,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.2907,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.6471,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.7575,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,512,7168,torch.float8_e4m3fnuz,ck,24,0,10.7923,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,108.82,352.2,0.0 -80,32,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.1022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,211.56,354.18,0.0 -80,48,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.2279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,313.79,361.89,0.0 -80,64,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.1263,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,422.21,376.97,0.0 -80,80,512,7168,torch.float8_e4m3fnuz,ck,24,0,11.2303,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,522.87,385.15,0.0 -80,96,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.6635,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,604.14,382.08,0.0 -80,112,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.9903,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,685.62,382.6,0.0 -80,128,512,7168,torch.float8_e4m3fnuz,ck,19,0,12.3367,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,761.57,382.48,0.0 -80,160,512,7168,torch.float8_e4m3fnuz,ck,25,0,12.1467,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,192,512,7168,torch.float8_e4m3fnuz,ck,10,0,16.0911,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,224,512,7168,torch.float8_e4m3fnuz,ck,19,0,17.1051,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,256,512,7168,torch.float8_e4m3fnuz,ck,6,0,17.5859,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1068.5,327.94,0.0 -80,288,512,7168,torch.float8_e4m3fnuz,ck,6,0,17.6475,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,320,512,7168,torch.float8_e4m3fnuz,ck,6,0,17.8315,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,352,512,7168,torch.float8_e4m3fnuz,ck,112,0,22.2763,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,384,512,7168,torch.float8_e4m3fnuz,ck,113,0,21.2655,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,416,512,7168,torch.float8_e4m3fnuz,ck,112,0,22.5591,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,448,512,7168,torch.float8_e4m3fnuz,ck,112,0,22.7188,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,480,512,7168,torch.float8_e4m3fnuz,ck,113,0,21.5703,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,512,512,7168,torch.float8_e4m3fnuz,ck,119,0,23.6639,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1588.11,332.33,0.0 -80,1024,512,7168,torch.float8_e4m3fnuz,ck,114,0,36.6616,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2050.15,328.92,0.0 -80,1536,512,7168,torch.float8_e4m3fnuz,ck,120,0,48.0728,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2345.25,338.09,0.0 -80,2048,512,7168,torch.float8_e4m3fnuz,ck,85,0,58.1113,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2586.83,351.86,0.0 -80,4096,512,7168,torch.float8_e4m3fnuz,ck,85,0,101.7067,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2956.03,366.0,0.0 -80,6144,512,7168,torch.float8_e4m3fnuz,ck,85,0,143.4871,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,512,7168,torch.float8_e4m3fnuz,ck,85,0,195.7643,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3071.53,361.55,0.0 -80,10240,512,7168,torch.float8_e4m3fnuz,ck,0,0,211.3264,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,512,7168,torch.float8_e4m3fnuz,ck,0,0,262.6171,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,512,7168,torch.float8_e4m3fnuz,ck,102,0,306.9194,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,512,7168,torch.float8_e4m3fnuz,ck,85,0,354.1062,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3396.13,389.4,0.0 -80,20480,512,7168,torch.float8_e4m3fnuz,ck,70,0,411.9813,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3648.8,416.14,0.0 -80,32768,512,7168,torch.float8_e4m3fnuz,ck,70,0,664.7692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3618.07,409.32,0.0 -80,65536,512,7168,torch.float8_e4m3fnuz,ck,70,0,1327.0618,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,512,7168,torch.float8_e4m3fnuz,ck,102,0,1866.4062,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,512,7168,torch.float8_e4m3fnuz,ck,102,0,2615.157,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.2398,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.5102,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.6411,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.871,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,576,7168,torch.float8_e4m3fnuz,ck,10,0,10.7303,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,12.31,397.18,0.0 -80,32,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.199,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,235.95,392.45,0.0 -80,64,576,7168,torch.float8_e4m3fnuz,ck,24,0,11.163,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,473.42,417.56,0.0 -80,96,576,7168,torch.float8_e4m3fnuz,ck,10,0,12.065,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,128,576,7168,torch.float8_e4m3fnuz,ck,25,0,12.3755,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,854.08,419.68,0.0 -80,160,576,7168,torch.float8_e4m3fnuz,ck,10,0,16.0783,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,192,576,7168,torch.float8_e4m3fnuz,ck,10,0,17.0079,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,224,576,7168,torch.float8_e4m3fnuz,ck,25,0,17.1379,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,256,576,7168,torch.float8_e4m3fnuz,ck,112,0,18.3779,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1150.26,340.56,0.0 -80,288,576,7168,torch.float8_e4m3fnuz,ck,113,0,21.2123,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,320,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.5067,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,352,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.5607,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,384,576,7168,torch.float8_e4m3fnuz,ck,113,0,21.2619,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,416,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.8351,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,448,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.8423,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,480,576,7168,torch.float8_e4m3fnuz,ck,112,0,24.5819,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,512,576,7168,torch.float8_e4m3fnuz,ck,114,0,23.9199,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1767.51,350.7,0.0 -80,1024,576,7168,torch.float8_e4m3fnuz,ck,114,0,37.1872,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2273.82,340.13,0.0 -80,1536,576,7168,torch.float8_e4m3fnuz,ck,128,0,47.2304,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2685.47,358.0,0.0 -80,2048,576,7168,torch.float8_e4m3fnuz,ck,129,0,60.7837,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2782.23,348.25,0.0 -80,4096,576,7168,torch.float8_e4m3fnuz,ck,93,0,113.3711,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2983.38,337.01,0.0 -80,6144,576,7168,torch.float8_e4m3fnuz,ck,68,0,151.638,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,576,7168,torch.float8_e4m3fnuz,ck,93,0,191.4823,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3532.74,377.51,0.0 -80,10240,576,7168,torch.float8_e4m3fnuz,ck,68,0,230.3376,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,576,7168,torch.float8_e4m3fnuz,ck,94,0,285.6522,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,576,7168,torch.float8_e4m3fnuz,ck,93,0,341.468,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,576,7168,torch.float8_e4m3fnuz,ck,68,0,372.1747,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3635.16,377.36,0.0 -80,20480,576,7168,torch.float8_e4m3fnuz,ck,68,0,439.759,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3845.61,396.86,0.0 -80,32768,576,7168,torch.float8_e4m3fnuz,ck,68,0,729.7961,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,576,7168,torch.float8_e4m3fnuz,ck,93,0,1429.9239,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,576,7168,torch.float8_e4m3fnuz,ck,95,0,2112.9835,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 -80,131072,576,7168,torch.float8_e4m3fnuz,ck,93,0,2822.5197,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.1282,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.21,605.96,0.0 -80,2,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.0277,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.44,611.79,0.0 -80,4,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.1184,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,4.85,608.74,0.0 -80,8,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.2066,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.62,607.36,0.0 -80,16,1024,7168,torch.float8_e4m3fnuz,ck,10,0,11.306,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,20.77,662.26,0.0 -80,32,1024,7168,torch.float8_e4m3fnuz,ck,10,0,11.4004,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,41.21,669.71,0.0 -80,64,1024,7168,torch.float8_e4m3fnuz,ck,11,0,12.7183,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,73.87,623.5,0.0 -80,96,1024,7168,torch.float8_e4m3fnuz,ck,19,0,17.1761,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,82.05,478.85,0.0 -80,128,1024,7168,torch.float8_e4m3fnuz,ck,6,0,18.5394,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,101.35,459.54,0.0 -80,160,1024,7168,torch.float8_e4m3fnuz,ck,6,0,18.3958,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,127.68,479.16,0.0 -80,192,1024,7168,torch.float8_e4m3fnuz,ck,113,0,21.6602,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,130.13,420.56,0.0 -80,224,1024,7168,torch.float8_e4m3fnuz,ck,113,0,24.174,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,136.03,389.03,0.0 -80,256,1024,7168,torch.float8_e4m3fnuz,ck,114,0,24.24,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,155.04,400.14,0.0 -80,288,1024,7168,torch.float8_e4m3fnuz,ck,119,0,24.2463,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,174.37,412.2,0.0 -80,320,1024,7168,torch.float8_e4m3fnuz,ck,114,0,24.291,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,193.39,423.58,0.0 -80,352,1024,7168,torch.float8_e4m3fnuz,ck,120,0,31.8392,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,162.3,332.42,0.0 -80,384,1024,7168,torch.float8_e4m3fnuz,ck,120,0,29.8956,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,188.56,363.9,0.0 -80,416,1024,7168,torch.float8_e4m3fnuz,ck,120,0,31.629,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,193.08,353.28,0.0 -80,448,1024,7168,torch.float8_e4m3fnuz,ck,120,0,31.7988,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,206.82,360.67,0.0 -80,480,1024,7168,torch.float8_e4m3fnuz,ck,120,0,29.9502,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,235.27,392.78,0.0 -80,512,1024,7168,torch.float8_e4m3fnuz,ck,119,0,37.7242,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,199.24,319.65,0.0 -80,1024,1024,7168,torch.float8_e4m3fnuz,ck,85,0,59.714,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,251.74,280.96,0.0 -80,2048,1024,7168,torch.float8_e4m3fnuz,ck,85,0,105.1832,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.83,249.23,0.0 -80,4096,1024,7168,torch.float8_e4m3fnuz,ck,85,0,187.4655,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.75,240.52,0.0 -80,6144,1024,7168,torch.float8_e4m3fnuz,ck,85,0,267.9814,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,336.57,238.68,0.0 -80,8192,1024,7168,torch.float8_e4m3fnuz,ck,85,0,349.6474,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,343.94,236.92,0.0 -80,10240,1024,7168,torch.float8_e4m3fnuz,ck,70,0,424.2224,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,354.35,239.76,0.0 -80,12288,1024,7168,torch.float8_e4m3fnuz,ck,85,0,519.3639,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.33,232.18,0.0 -80,14336,1024,7168,torch.float8_e4m3fnuz,ck,85,0,598.3138,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,351.74,233.09,0.0 -80,16384,1024,7168,torch.float8_e4m3fnuz,ck,70,0,681.6293,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,352.86,232.29,0.0 -80,32768,1024,7168,torch.float8_e4m3fnuz,ck,85,0,1338.2851,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,359.44,231.14,0.0 -80,65536,1024,7168,torch.float8_e4m3fnuz,ck,85,0,2666.7419,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,360.77,229.24,0.0 -80,98304,1024,7168,torch.float8_e4m3fnuz,ck,102,0,3838.9675,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,375.91,237.91,0.0 -80,1,1280,8192,torch.float8_e4m3fnuz,ck,10,0,13.1036,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.6,801.04,0.0 -80,32,1280,8192,torch.float8_e4m3fnuz,ck,10,0,12.0442,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,55.72,899.17,0.0 -80,64,1280,8192,torch.float8_e4m3fnuz,ck,19,0,13.7807,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,973.95,810.84,0.0 -80,128,1280,8192,torch.float8_e4m3fnuz,ck,6,0,20.4871,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1310.27,579.0,0.0 -80,192,1280,8192,torch.float8_e4m3fnuz,ck,113,0,23.6583,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,170.2,530.48,0.0 -80,256,1280,8192,torch.float8_e4m3fnuz,ck,114,0,26.0071,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,206.43,509.03,0.0 -80,320,1280,8192,torch.float8_e4m3fnuz,ck,115,0,32.2959,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2077.94,431.21,0.0 -80,512,1280,8192,torch.float8_e4m3fnuz,ck,114,0,42.3216,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2537.1,377.84,0.0 -80,1024,1280,8192,torch.float8_e4m3fnuz,ck,85,0,68.1209,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3152.46,315.55,0.0 -80,2048,1280,8192,torch.float8_e4m3fnuz,ck,0,0,119.2848,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3600.6,272.51,0.0 -80,4096,1280,8192,torch.float8_e4m3fnuz,ck,71,0,234.5689,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3662.01,232.45,0.0 -80,8192,1280,8192,torch.float8_e4m3fnuz,ck,71,0,452.4299,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3797.24,217.86,0.0 -80,16384,1280,8192,torch.float8_e4m3fnuz,ck,71,0,902.6619,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3806.49,206.77,0.0 -80,1,1536,7168,torch.float8_e4m3fnuz,ck,10,0,11.721,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,1536,7168,torch.float8_e4m3fnuz,ck,10,0,12.1679,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,1536,7168,torch.float8_e4m3fnuz,ck,10,0,12.2579,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,1536,7168,torch.float8_e4m3fnuz,ck,10,0,12.559,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,1536,7168,torch.float8_e4m3fnuz,ck,10,0,10.6831,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,32.98,1045.94,0.0 -80,32,1536,7168,torch.float8_e4m3fnuz,ck,11,0,12.4632,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,56.54,909.7,0.0 -80,64,1536,7168,torch.float8_e4m3fnuz,ck,19,0,16.9363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,832.11,688.78,0.0 -80,96,1536,7168,torch.float8_e4m3fnuz,ck,6,0,18.0262,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,128,1536,7168,torch.float8_e4m3fnuz,ck,112,0,22.8955,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1231.06,538.13,0.0 -80,160,1536,7168,torch.float8_e4m3fnuz,ck,119,0,24.4126,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,192,1536,7168,torch.float8_e4m3fnuz,ck,114,0,24.0194,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,224,1536,7168,torch.float8_e4m3fnuz,ck,115,0,31.7078,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 -80,256,1536,7168,torch.float8_e4m3fnuz,ck,120,0,31.1851,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1807.64,437.12,0.0 -80,288,1536,7168,torch.float8_e4m3fnuz,ck,120,0,29.6846,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,320,1536,7168,torch.float8_e4m3fnuz,ck,126,0,32.1157,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,352,1536,7168,torch.float8_e4m3fnuz,ck,133,0,39.9737,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,384,1536,7168,torch.float8_e4m3fnuz,ck,114,0,37.6905,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,1536,7168,torch.float8_e4m3fnuz,ck,119,0,38.3401,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,448,1536,7168,torch.float8_e4m3fnuz,ck,113,0,46.246,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,480,1536,7168,torch.float8_e4m3fnuz,ck,113,0,45.4048,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,512,1536,7168,torch.float8_e4m3fnuz,ck,128,0,47.65,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2366.06,341.09,0.0 -80,1024,1536,7168,torch.float8_e4m3fnuz,ck,136,0,74.0965,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,3043.14,290.11,0.0 -80,1536,1536,7168,torch.float8_e4m3fnuz,ck,0,0,105.4423,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3207.71,253.59,0.0 -80,2048,1536,7168,torch.float8_e4m3fnuz,ck,85,0,138.8692,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3247.46,230.3,0.0 -80,4096,1536,7168,torch.float8_e4m3fnuz,ck,85,0,256.917,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3510.64,206.11,0.0 -80,6144,1536,7168,torch.float8_e4m3fnuz,ck,93,0,368.8956,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,1536,7168,torch.float8_e4m3fnuz,ck,72,0,501.1581,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3599.44,189.35,0.0 -80,10240,1536,7168,torch.float8_e4m3fnuz,ck,68,0,583.4998,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,1536,7168,torch.float8_e4m3fnuz,ck,94,0,718.7933,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,1536,7168,torch.float8_e4m3fnuz,ck,93,0,830.4984,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,1536,7168,torch.float8_e4m3fnuz,ck,68,0,940.0969,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3837.66,190.17,0.0 -80,20480,1536,7168,torch.float8_e4m3fnuz,ck,93,0,1157.9543,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3894.55,190.62,0.0 -80,32768,1536,7168,torch.float8_e4m3fnuz,ck,68,0,1872.4274,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,1536,7168,torch.float8_e4m3fnuz,ck,93,0,3736.9156,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,1536,7168,torch.float8_e4m3fnuz,ck,102,0,5497.497,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,1536,7168,torch.float8_e4m3fnuz,ck,68,0,7429.8789,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,2048,7168,torch.float8_e4m3fnuz,ck,10,0,12.1709,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.41,1207.09,0.0 -80,2,2048,7168,torch.float8_e4m3fnuz,ck,10,0,11.7272,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.01,1253.72,0.0 -80,4,2048,7168,torch.float8_e4m3fnuz,ck,10,0,12.2432,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.59,1202.72,0.0 -80,8,2048,7168,torch.float8_e4m3fnuz,ck,10,0,12.4565,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,18.86,1185.74,0.0 -80,16,2048,7168,torch.float8_e4m3fnuz,ck,10,0,11.2137,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,41.89,1325.19,0.0 -80,32,2048,7168,torch.float8_e4m3fnuz,ck,11,0,12.645,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,74.3,1189.44,0.0 -80,64,2048,7168,torch.float8_e4m3fnuz,ck,6,0,18.2067,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,103.21,845.9,0.0 -80,96,2048,7168,torch.float8_e4m3fnuz,ck,113,0,21.5433,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,130.83,731.62,0.0 -80,128,2048,7168,torch.float8_e4m3fnuz,ck,114,0,24.2173,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,155.18,665.72,0.0 -80,160,2048,7168,torch.float8_e4m3fnuz,ck,119,0,24.4219,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,192.35,674.9,0.0 -80,192,2048,7168,torch.float8_e4m3fnuz,ck,120,0,29.915,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,188.44,563.02,0.0 -80,224,2048,7168,torch.float8_e4m3fnuz,ck,120,0,31.6736,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,207.64,543.14,0.0 -80,256,2048,7168,torch.float8_e4m3fnuz,ck,114,0,38.2291,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,196.61,459.43,0.0 -80,288,2048,7168,torch.float8_e4m3fnuz,ck,119,0,38.7116,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,218.43,463.02,0.0 -80,320,2048,7168,torch.float8_e4m3fnuz,ck,114,0,38.5684,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,243.6,474.08,0.0 -80,352,2048,7168,torch.float8_e4m3fnuz,ck,122,0,50.8883,a8w8_bpreshuffle_256x80x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,203.09,366.39,0.0 -80,384,2048,7168,torch.float8_e4m3fnuz,ck,123,0,50.7212,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,222.28,374.7,0.0 -80,416,2048,7168,torch.float8_e4m3fnuz,ck,120,0,52.9749,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,230.56,365.57,0.0 -80,448,2048,7168,torch.float8_e4m3fnuz,ck,123,0,53.8786,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,244.13,366.13,0.0 -80,480,2048,7168,torch.float8_e4m3fnuz,ck,123,0,51.3534,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,274.43,391.15,0.0 -80,512,2048,7168,torch.float8_e4m3fnuz,ck,85,0,60.8768,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.93,335.88,0.0 -80,1024,2048,7168,torch.float8_e4m3fnuz,ck,85,0,105.0089,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,286.31,249.64,0.0 -80,2048,2048,7168,torch.float8_e4m3fnuz,ck,85,0,184.1052,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.6,205.04,0.0 -80,4096,2048,7168,torch.float8_e4m3fnuz,ck,85,0,340.0794,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,353.62,178.83,0.0 -80,6144,2048,7168,torch.float8_e4m3fnuz,ck,85,0,507.8765,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,355.18,165.17,0.0 -80,8192,2048,7168,torch.float8_e4m3fnuz,ck,71,0,664.0129,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,362.22,161.07,0.0 -80,10240,2048,7168,torch.float8_e4m3fnuz,ck,85,0,828.0104,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,363.1,157.03,0.0 -80,12288,2048,7168,torch.float8_e4m3fnuz,ck,102,0,976.0123,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.64,156.85,0.0 -80,14336,2048,7168,torch.float8_e4m3fnuz,ck,102,0,1150.9875,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.69,153.05,0.0 -80,16384,2048,7168,torch.float8_e4m3fnuz,ck,85,0,1318.349,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.88,151.12,0.0 -80,32768,2048,7168,torch.float8_e4m3fnuz,ck,85,0,2600.154,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,370.01,147.6,0.0 -80,65536,2048,7168,torch.float8_e4m3fnuz,ck,85,0,5214.9027,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,368.97,144.37,0.0 -80,98304,2048,7168,torch.float8_e4m3fnuz,ck,102,0,7585.8107,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.48,147.9,0.0 -80,1,2112,7168,torch.float8_e4m3fnuz,ck,10,0,11.7937,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.57,1284.6,0.0 -80,16,2112,7168,torch.float8_e4m3fnuz,ck,10,0,10.8571,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,44.62,1411.16,0.0 -80,32,2112,7168,torch.float8_e4m3fnuz,ck,11,0,12.5633,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,77.12,1234.02,0.0 -80,48,2112,7168,torch.float8_e4m3fnuz,ck,10,0,18.4707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,786.83,849.22,0.0 -80,64,2112,7168,torch.float8_e4m3fnuz,ck,12,0,18.5595,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,104.41,854.97,0.0 -80,80,2112,7168,torch.float8_e4m3fnuz,ck,113,0,23.5729,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,102.75,680.87,0.0 -80,96,2112,7168,torch.float8_e4m3fnuz,ck,113,0,21.5242,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,135.04,754.15,0.0 -80,112,2112,7168,torch.float8_e4m3fnuz,ck,112,0,25.9619,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1306.18,632.26,0.0 -80,128,2112,7168,torch.float8_e4m3fnuz,ck,114,0,23.6699,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,163.73,701.19,0.0 -80,160,2112,7168,torch.float8_e4m3fnuz,ck,115,0,28.9398,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1673.96,586.1,0.0 -80,192,2112,7168,torch.float8_e4m3fnuz,ck,126,0,32.1086,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1810.51,539.61,0.0 -80,224,2112,7168,torch.float8_e4m3fnuz,ck,126,0,32.5198,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,2085.56,544.0,0.0 -80,256,2112,7168,torch.float8_e4m3fnuz,ck,114,0,37.3908,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2072.99,482.88,0.0 -80,288,2112,7168,torch.float8_e4m3fnuz,ck,113,0,44.2402,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1971.05,416.36,0.0 -80,320,2112,7168,torch.float8_e4m3fnuz,ck,113,0,45.9122,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2110.3,409.13,0.0 -80,352,2112,7168,torch.float8_e4m3fnuz,ck,128,0,49.7251,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2143.33,385.09,0.0 -80,384,2112,7168,torch.float8_e4m3fnuz,ck,128,0,47.5375,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2445.78,410.48,0.0 -80,512,2112,7168,torch.float8_e4m3fnuz,ck,129,0,60.4509,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2564.42,346.92,0.0 -80,1024,2112,7168,torch.float8_e4m3fnuz,ck,93,0,115.8099,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2677.17,231.45,0.0 -80,1536,2112,7168,torch.float8_e4m3fnuz,ck,68,0,148.9129,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3123.06,219.17,0.0 -80,2048,2112,7168,torch.float8_e4m3fnuz,ck,93,0,186.5271,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3324.37,206.24,0.0 -80,4096,2112,7168,torch.float8_e4m3fnuz,ck,93,0,329.2957,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3766.13,187.67,0.0 -80,8192,2112,7168,torch.float8_e4m3fnuz,ck,68,0,649.3748,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3819.59,167.03,0.0 -80,16384,2112,7168,torch.float8_e4m3fnuz,ck,93,0,1284.8613,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3860.87,157.05,0.0 -80,32768,2112,7168,torch.float8_e4m3fnuz,ck,93,0,2566.2639,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3866.08,151.36,0.0 -80,49152,2112,7168,torch.float8_e4m3fnuz,ck,93,0,3805.7733,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3910.39,151.11,0.0 -80,65536,2112,7168,torch.float8_e4m3fnuz,ck,93,0,5062.5452,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3919.52,150.46,0.0 -80,73728,2112,7168,torch.float8_e4m3fnuz,ck,93,0,5700.3689,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3916.08,150.0,0.0 -80,98304,2112,7168,torch.float8_e4m3fnuz,ck,95,0,7892.5471,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,377.12,143.81,0.0 -80,131072,2112,7168,torch.float8_e4m3fnuz,ck,93,0,10093.2233,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3931.9,149.44,0.0 -80,1,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.693,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.75,1374.15,0.0 -80,2,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.9922,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.36,1340.84,0.0 -80,4,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.9683,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.73,1345.46,0.0 -80,8,2240,7168,torch.float8_e4m3fnuz,ck,10,0,12.2626,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,20.95,1316.97,0.0 -80,16,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.1279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,461.72,1459.64,0.0 -80,32,2240,7168,torch.float8_e4m3fnuz,ck,11,0,12.5136,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,82.12,1312.9,0.0 -80,48,2240,7168,torch.float8_e4m3fnuz,ck,10,0,18.3626,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,839.43,904.85,0.0 -80,64,2240,7168,torch.float8_e4m3fnuz,ck,12,0,18.5811,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,110.61,904.24,0.0 -80,80,2240,7168,torch.float8_e4m3fnuz,ck,113,0,23.5984,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1088.64,719.89,0.0 -80,96,2240,7168,torch.float8_e4m3fnuz,ck,113,0,21.5933,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,142.77,795.36,0.0 -80,112,2240,7168,torch.float8_e4m3fnuz,ck,112,0,26.1124,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1377.36,664.85,0.0 -80,128,2240,7168,torch.float8_e4m3fnuz,ck,114,0,23.9804,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1714.07,731.73,0.0 -80,160,2240,7168,torch.float8_e4m3fnuz,ck,115,0,28.9759,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 -80,192,2240,7168,torch.float8_e4m3fnuz,ck,113,0,32.2862,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1909.68,566.58,0.0 -80,224,2240,7168,torch.float8_e4m3fnuz,ck,117,0,37.5099,a8w8_bpreshuffle_256x112x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 -80,256,2240,7168,torch.float8_e4m3fnuz,ck,114,0,37.1525,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2212.73,512.43,0.0 -80,288,2240,7168,torch.float8_e4m3fnuz,ck,113,0,44.5,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,320,2240,7168,torch.float8_e4m3fnuz,ck,112,0,53.8671,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,190.77,367.27,0.0 -80,352,2240,7168,torch.float8_e4m3fnuz,ck,78,0,54.0168,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,2240,7168,torch.float8_e4m3fnuz,ck,78,0,53.062,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,2240,7168,torch.float8_e4m3fnuz,ck,113,0,62.1588,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,448,2240,7168,torch.float8_e4m3fnuz,ck,114,0,70.1253,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,2240,7168,torch.float8_e4m3fnuz,ck,62,0,70.4585,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,2240,7168,torch.float8_e4m3fnuz,ck,62,0,66.3638,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2477.51,331.81,0.0 -80,1024,2240,7168,torch.float8_e4m3fnuz,ck,114,0,120.7469,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2723.33,231.76,0.0 -80,1536,2240,7168,torch.float8_e4m3fnuz,ck,48,0,177.0619,a8w8_bpreshuffle_256x192x224x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2785.75,191.73,0.0 -80,2048,2240,7168,torch.float8_e4m3fnuz,ck,79,0,230.9106,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2848.14,172.84,0.0 -80,4096,2240,7168,torch.float8_e4m3fnuz,ck,69,0,437.1832,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3008.66,145.86,0.0 -80,6144,2240,7168,torch.float8_e4m3fnuz,ck,69,0,662.2802,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,2240,7168,torch.float8_e4m3fnuz,ck,69,0,857.9718,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3066.15,129.93,0.0 -80,10240,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1018.0438,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1231.0498,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1461.7946,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1644.8521,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3198.67,125.78,0.0 -80,32768,2240,7168,torch.float8_e4m3fnuz,ck,69,0,3196.032,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3292.42,124.45,0.0 -80,49152,2240,7168,torch.float8_e4m3fnuz,ck,69,0,4812.8899,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3279.53,122.29,0.0 -80,65536,2240,7168,torch.float8_e4m3fnuz,ck,69,0,6365.1402,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3306.34,122.45,0.0 -80,73728,2240,7168,torch.float8_e4m3fnuz,ck,69,0,7155.8623,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3308.62,122.26,0.0 -80,98304,2240,7168,torch.float8_e4m3fnuz,ck,69,0,9755.5399,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,2240,7168,torch.float8_e4m3fnuz,ck,69,0,12728.0402,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3306.93,121.21,0.0 -80,1,3072,1536,torch.float8_e4m3fnuz,ck,5,0,5.6562,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,3072,1536,torch.float8_e4m3fnuz,ck,11,0,5.911,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,3072,1536,torch.float8_e4m3fnuz,ck,11,0,5.8834,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,3072,1536,torch.float8_e4m3fnuz,ck,10,0,5.8254,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,3072,1536,torch.float8_e4m3fnuz,ck,11,0,5.8974,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,256.04,820.95,0.0 -80,32,3072,1536,torch.float8_e4m3fnuz,ck,112,0,6.9922,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,431.9,709.98,0.0 -80,64,3072,1536,torch.float8_e4m3fnuz,ck,114,0,9.0146,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,670.0,577.96,0.0 -80,96,3072,1536,torch.float8_e4m3fnuz,ck,112,0,9.6664,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,937.24,564.42,0.0 -80,128,3072,1536,torch.float8_e4m3fnuz,ck,112,0,11.1838,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1080.1,509.81,0.0 -80,160,3072,1536,torch.float8_e4m3fnuz,ck,112,0,11.4913,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1313.99,517.56,0.0 -80,192,3072,1536,torch.float8_e4m3fnuz,ck,119,0,12.4325,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1457.42,498.14,0.0 -80,224,3072,1536,torch.float8_e4m3fnuz,ck,113,0,14.6673,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1441.25,439.0,0.0 -80,256,3072,1536,torch.float8_e4m3fnuz,ck,120,0,16.0983,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1500.73,415.24,0.0 -80,288,3072,1536,torch.float8_e4m3fnuz,ck,120,0,16.2173,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1675.93,427.35,0.0 -80,320,3072,1536,torch.float8_e4m3fnuz,ck,119,0,16.7997,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1797.59,427.16,0.0 -80,352,3072,1536,torch.float8_e4m3fnuz,ck,85,0,18.9609,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1751.97,391.43,0.0 -80,384,3072,1536,torch.float8_e4m3fnuz,ck,85,0,18.7893,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1928.69,408.09,0.0 -80,416,3072,1536,torch.float8_e4m3fnuz,ck,100,0,20.7247,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,448,3072,1536,torch.float8_e4m3fnuz,ck,92,0,22.3115,a8w8_bpreshuffle_256x32x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,480,3072,1536,torch.float8_e4m3fnuz,ck,94,0,22.3915,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,3072,1536,torch.float8_e4m3fnuz,ck,85,0,24.3751,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1982.28,354.9,0.0 -80,1024,3072,1536,torch.float8_e4m3fnuz,ck,85,0,38.8336,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2488.48,324.02,0.0 -80,1536,3072,1536,torch.float8_e4m3fnuz,ck,93,0,51.4668,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2816.48,320.89,0.0 -80,2048,3072,1536,torch.float8_e4m3fnuz,ck,72,0,69.8541,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2766.82,292.71,0.0 -80,4096,3072,1536,torch.float8_e4m3fnuz,ck,93,0,123.6652,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3125.75,292.53,0.0 -80,6144,3072,1536,torch.float8_e4m3fnuz,ck,85,0,183.0209,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,3072,1536,torch.float8_e4m3fnuz,ck,71,0,235.4545,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3283.41,287.25,0.0 -80,10240,3072,1536,torch.float8_e4m3fnuz,ck,71,0,290.0173,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,3072,1536,torch.float8_e4m3fnuz,ck,72,0,352.0628,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,14336,3072,1536,torch.float8_e4m3fnuz,ck,93,0,405.8954,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,3072,1536,torch.float8_e4m3fnuz,ck,71,0,450.9699,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3428.58,289.48,0.0 -80,20480,3072,1536,torch.float8_e4m3fnuz,ck,71,0,557.954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3463.97,290.36,0.0 -80,32768,3072,1536,torch.float8_e4m3fnuz,ck,71,0,909.865,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,3072,1536,torch.float8_e4m3fnuz,ck,71,0,1787.3796,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,3072,1536,torch.float8_e4m3fnuz,ck,102,0,2698.4705,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,3072,1536,torch.float8_e4m3fnuz,ck,71,0,3559.9187,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.2138,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.95,499.75,0.0 -80,1,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.4343,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,4096,512,torch.float8_e4m3fnuz,ck,23,0,4.4526,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,2,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.2939,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,4096,512,torch.float8_e4m3fnuz,ck,23,0,4.5342,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,4,4096,7168,torch.float8_e4m3fnuz,ck,5,0,13.5363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.1858,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.6103,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.2875,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,15.65,521.61,0.0 -80,16,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.1683,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,32,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.8822,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,274.91,486.6,0.0 -80,32,4096,7168,torch.float8_e4m3fnuz,ck,12,0,18.5983,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,48,4096,512,torch.float8_e4m3fnuz,ck,9,0,5.3534,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,376.07,469.78,0.0 -80,64,4096,512,torch.float8_e4m3fnuz,ck,77,0,5.8414,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,459.54,454.38,0.0 -80,64,4096,7168,torch.float8_e4m3fnuz,ck,114,0,23.8807,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,80,4096,512,torch.float8_e4m3fnuz,ck,76,0,6.8102,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,492.71,410.19,0.0 -80,96,4096,512,torch.float8_e4m3fnuz,ck,76,0,6.6322,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,607.12,442.2,0.0 -80,96,4096,7168,torch.float8_e4m3fnuz,ck,120,0,29.6988,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,112,4096,512,torch.float8_e4m3fnuz,ck,76,0,7.3154,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,642.15,419.94,0.0 -80,128,4096,512,torch.float8_e4m3fnuz,ck,76,0,7.2018,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,745.47,445.9,0.0 -80,128,4096,7168,torch.float8_e4m3fnuz,ck,121,0,38.8588,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,160,4096,512,torch.float8_e4m3fnuz,ck,84,0,7.7118,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,160,4096,7168,torch.float8_e4m3fnuz,ck,119,0,40.6369,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,192,4096,512,torch.float8_e4m3fnuz,ck,84,0,9.9422,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,192,4096,7168,torch.float8_e4m3fnuz,ck,123,0,49.1457,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,224,4096,512,torch.float8_e4m3fnuz,ck,76,0,10.187,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,224,4096,7168,torch.float8_e4m3fnuz,ck,120,0,52.8765,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,256,4096,512,torch.float8_e4m3fnuz,ck,76,0,10.9814,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,977.78,393.88,0.0 -80,256,4096,7168,torch.float8_e4m3fnuz,ck,85,0,60.0334,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,288,4096,512,torch.float8_e4m3fnuz,ck,85,0,10.8502,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,288,4096,7168,torch.float8_e4m3fnuz,ck,85,0,62.9998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,4096,512,torch.float8_e4m3fnuz,ck,84,0,10.783,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,320,4096,7168,torch.float8_e4m3fnuz,ck,121,0,63.2822,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,352,4096,512,torch.float8_e4m3fnuz,ck,84,0,12.0603,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,352,4096,7168,torch.float8_e4m3fnuz,ck,136,0,74.9351,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,384,4096,512,torch.float8_e4m3fnuz,ck,84,0,12.2599,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,384,4096,7168,torch.float8_e4m3fnuz,ck,86,0,81.866,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,4096,512,torch.float8_e4m3fnuz,ck,85,0,13.5035,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,4096,7168,torch.float8_e4m3fnuz,ck,85,0,86.134,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,4096,512,torch.float8_e4m3fnuz,ck,85,0,13.3723,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,4096,7168,torch.float8_e4m3fnuz,ck,85,0,84.5872,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,4096,512,torch.float8_e4m3fnuz,ck,86,0,14.0771,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,4096,7168,torch.float8_e4m3fnuz,ck,86,0,82.8028,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,4096,512,torch.float8_e4m3fnuz,ck,85,0,15.7231,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1365.81,416.81,0.0 -80,512,4096,7168,torch.float8_e4m3fnuz,ck,138,0,102.7541,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,1024,4096,512,torch.float8_e4m3fnuz,ck,85,0,24.9039,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1724.62,442.1,0.0 -80,1024,4096,7168,torch.float8_e4m3fnuz,ck,85,0,179.0322,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1536,4096,512,torch.float8_e4m3fnuz,ck,85,0,33.2015,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1940.41,465.84,0.0 -80,2048,4096,512,torch.float8_e4m3fnuz,ck,71,0,43.6672,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1967.14,456.25,0.0 -80,2048,4096,7168,torch.float8_e4m3fnuz,ck,85,0,328.802,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,4096,512,torch.float8_e4m3fnuz,ck,71,0,75.7421,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2268.21,498.39,0.0 -80,4096,4096,7168,torch.float8_e4m3fnuz,ck,85,0,645.3373,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,4096,512,torch.float8_e4m3fnuz,ck,71,0,109.7738,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,4096,7168,torch.float8_e4m3fnuz,ck,102,0,937.0884,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,4096,512,torch.float8_e4m3fnuz,ck,71,0,140.1016,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2452.49,523.91,0.0 -80,8192,4096,7168,torch.float8_e4m3fnuz,ck,85,0,1259.5274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,4096,512,torch.float8_e4m3fnuz,ck,71,0,173.8713,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,4096,7168,torch.float8_e4m3fnuz,ck,85,0,1574.2695,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,4096,512,torch.float8_e4m3fnuz,ck,71,0,207.219,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,4096,7168,torch.float8_e4m3fnuz,ck,102,0,1858.9016,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,4096,512,torch.float8_e4m3fnuz,ck,71,0,239.3595,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,4096,7168,torch.float8_e4m3fnuz,ck,85,0,2197.8676,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,4096,512,torch.float8_e4m3fnuz,ck,71,0,268.2002,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2562.25,539.54,0.0 -80,16384,4096,7168,torch.float8_e4m3fnuz,ck,85,0,2498.7773,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,20480,4096,512,torch.float8_e4m3fnuz,ck,71,0,331.5621,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2590.75,543.96,0.0 -80,32768,4096,512,torch.float8_e4m3fnuz,ck,71,0,523.3298,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2626.24,549.0,0.0 -80,32768,4096,7168,torch.float8_e4m3fnuz,ck,74,0,5016.1101,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,65536,4096,512,torch.float8_e4m3fnuz,ck,71,0,1055.1661,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,4096,7168,torch.float8_e4m3fnuz,ck,85,0,9976.6212,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,4096,512,torch.float8_e4m3fnuz,ck,71,0,1607.6098,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,4096,7168,torch.float8_e4m3fnuz,ck,102,0,14573.0079,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,4096,512,torch.float8_e4m3fnuz,ck,71,0,2125.6078,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,4096,7168,torch.float8_e4m3fnuz,ck,85,0,19941.3998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,4608,4096,torch.float8_e4m3fnuz,ck,5,0,9.6645,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.91,1954.34,0.0 -80,1,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.8515,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,47.69,2385.77,0.0 -80,2,4608,4096,torch.float8_e4m3fnuz,ck,11,0,9.5889,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,7.87,1971.13,0.0 -80,2,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.597,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,4608,4096,torch.float8_e4m3fnuz,ck,11,0,9.7763,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,15.44,1936.07,0.0 -80,4,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.6563,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,4608,4096,torch.float8_e4m3fnuz,ck,10,0,9.6358,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,31.34,1969.83,0.0 -80,8,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.8099,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,4608,4096,torch.float8_e4m3fnuz,ck,10,0,9.4967,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,63.6,2009.89,0.0 -80,16,4608,7168,torch.float8_e4m3fnuz,ck,11,0,13.1327,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,80.48,2535.07,0.0 -80,32,4608,4096,torch.float8_e4m3fnuz,ck,12,0,12.9381,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,93.36,1491.75,0.0 -80,32,4608,7168,torch.float8_e4m3fnuz,ck,12,0,19.2855,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,109.61,1739.88,0.0 -80,48,4608,7168,torch.float8_e4m3fnuz,ck,113,0,21.7204,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,145.99,1556.9,0.0 -80,64,4608,4096,torch.float8_e4m3fnuz,ck,114,0,16.3715,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1475.69,1204.92,0.0 -80,64,4608,7168,torch.float8_e4m3fnuz,ck,114,0,23.8378,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,177.36,1429.61,0.0 -80,80,4608,7168,torch.float8_e4m3fnuz,ck,115,0,28.763,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,183.74,1193.92,0.0 -80,96,4608,7168,torch.float8_e4m3fnuz,ck,120,0,29.4707,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2151.9,1174.15,0.0 -80,112,4608,7168,torch.float8_e4m3fnuz,ck,117,0,37.2568,a8w8_bpreshuffle_256x112x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1985.88,935.81,0.0 -80,128,4608,4096,torch.float8_e4m3fnuz,ck,114,0,24.38,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,198.19,844.06,0.0 -80,128,4608,7168,torch.float8_e4m3fnuz,ck,114,0,38.4671,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.82,913.18,0.0 -80,160,4608,7168,torch.float8_e4m3fnuz,ck,122,0,46.244,a8w8_bpreshuffle_256x80x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,192,4608,7168,torch.float8_e4m3fnuz,ck,128,0,47.9264,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,224,4608,7168,torch.float8_e4m3fnuz,ck,124,0,56.3824,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,256,4608,4096,torch.float8_e4m3fnuz,ck,85,0,37.982,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2544.28,586.65,0.0 -80,256,4608,7168,torch.float8_e4m3fnuz,ck,121,0,62.2609,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2716.22,597.88,0.0 -80,288,4608,7168,torch.float8_e4m3fnuz,ck,130,0,66.7012,a8w8_bpreshuffle_256x96x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,4608,7168,torch.float8_e4m3fnuz,ck,136,0,72.1189,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,352,4608,7168,torch.float8_e4m3fnuz,ck,128,0,84.2541,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,4608,7168,torch.float8_e4m3fnuz,ck,93,0,81.9457,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,4608,7168,torch.float8_e4m3fnuz,ck,138,0,101.4186,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,448,4608,7168,torch.float8_e4m3fnuz,ck,138,0,97.0854,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,480,4608,7168,torch.float8_e4m3fnuz,ck,56,0,105.9346,a8w8_bpreshuffle_256x160x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,4608,7168,torch.float8_e4m3fnuz,ck,70,0,105.5819,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3203.47,392.29,0.0 -80,1024,4608,4096,torch.float8_e4m3fnuz,ck,71,0,120.44,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3209.46,269.89,0.0 -80,1024,4608,7168,torch.float8_e4m3fnuz,ck,93,0,189.1551,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3576.2,263.31,0.0 -80,1536,4608,7168,torch.float8_e4m3fnuz,ck,85,0,279.9019,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3625.15,207.92,0.0 -80,2048,4608,4096,torch.float8_e4m3fnuz,ck,68,0,218.9032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3531.67,210.77,0.0 -80,2048,4608,7168,torch.float8_e4m3fnuz,ck,93,0,366.0791,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3695.69,181.89,0.0 -80,4096,4608,4096,torch.float8_e4m3fnuz,ck,85,0,420.6946,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3675.32,174.47,0.0 -80,4096,4608,7168,torch.float8_e4m3fnuz,ck,85,0,715.7879,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3780.21,139.9,0.0 -80,6144,4608,7168,torch.float8_e4m3fnuz,ck,102,0,1071.1237,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,4608,7168,torch.float8_e4m3fnuz,ck,93,0,1405.0902,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3851.47,119.03,0.0 -80,10240,4608,7168,torch.float8_e4m3fnuz,ck,93,0,1722.172,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,4608,7168,torch.float8_e4m3fnuz,ck,68,0,2079.1883,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,4608,7168,torch.float8_e4m3fnuz,ck,93,0,2431.1066,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,4608,4096,torch.float8_e4m3fnuz,ck,93,0,1613.6388,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3832.8,146.86,0.0 -80,16384,4608,7168,torch.float8_e4m3fnuz,ck,93,0,2770.8265,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3906.17,108.8,0.0 -80,20480,4608,7168,torch.float8_e4m3fnuz,ck,68,0,3421.2891,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3954.4,107.73,0.0 -80,32768,4608,4096,torch.float8_e4m3fnuz,ck,93,0,3207.2182,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3856.77,141.89,0.0 -80,32768,4608,7168,torch.float8_e4m3fnuz,ck,68,0,5527.6346,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3916.08,103.1,0.0 -80,65536,4608,7168,torch.float8_e4m3fnuz,ck,68,0,11016.6648,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,4608,7168,torch.float8_e4m3fnuz,ck,102,0,16377.2032,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,4608,7168,torch.float8_e4m3fnuz,ck,93,0,21987.1209,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.4756,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.52,1264.25,0.0 -80,2,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.3933,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.11,1280.19,0.0 -80,4,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.5748,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.97,1253.17,0.0 -80,8,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.6156,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.83,1253.71,0.0 -80,16,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.8653,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,38.4,1227.97,0.0 -80,32,6144,1536,torch.float8_e4m3fnuz,ck,112,0,9.748,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,61.96,1013.5,0.0 -80,64,6144,1536,torch.float8_e4m3fnuz,ck,112,0,12.3918,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,97.48,832.96,0.0 -80,96,6144,1536,torch.float8_e4m3fnuz,ck,119,0,13.368,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,135.54,805.23,0.0 -80,128,6144,1536,torch.float8_e4m3fnuz,ck,112,0,17.1016,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,141.27,655.3,0.0 -80,160,6144,1536,torch.float8_e4m3fnuz,ck,119,0,17.7021,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,170.6,658.06,0.0 -80,192,6144,1536,torch.float8_e4m3fnuz,ck,121,0,20.6837,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,175.2,584.59,0.0 -80,224,6144,1536,torch.float8_e4m3fnuz,ck,92,0,24.1353,a8w8_bpreshuffle_256x32x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,175.17,519.31,0.0 -80,256,6144,1536,torch.float8_e4m3fnuz,ck,85,0,26.6995,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,180.97,486.01,0.0 -80,288,6144,1536,torch.float8_e4m3fnuz,ck,86,0,27.4257,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,198.2,489.27,0.0 -80,320,6144,1536,torch.float8_e4m3fnuz,ck,93,0,28.1164,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.81,492.98,0.0 -80,352,6144,1536,torch.float8_e4m3fnuz,ck,85,0,33.8656,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,196.18,422.35,0.0 -80,384,6144,1536,torch.float8_e4m3fnuz,ck,85,0,33.7439,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.79,436.99,0.0 -80,416,6144,1536,torch.float8_e4m3fnuz,ck,93,0,37.9824,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,206.72,399.87,0.0 -80,448,6144,1536,torch.float8_e4m3fnuz,ck,93,0,37.7868,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,223.77,413.65,0.0 -80,480,6144,1536,torch.float8_e4m3fnuz,ck,94,0,38.3828,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,236.04,418.75,0.0 -80,512,6144,1536,torch.float8_e4m3fnuz,ck,85,0,42.8776,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,225.38,385.17,0.0 -80,1024,6144,1536,torch.float8_e4m3fnuz,ck,85,0,74.8161,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,258.33,315.35,0.0 -80,2048,6144,1536,torch.float8_e4m3fnuz,ck,71,0,137.2984,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,281.54,274.94,0.0 -80,4096,6144,1536,torch.float8_e4m3fnuz,ck,71,0,262.0027,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,295.07,252.14,0.0 -80,6144,6144,1536,torch.float8_e4m3fnuz,ck,71,0,375.4724,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.85,251.34,0.0 -80,8192,6144,1536,torch.float8_e4m3fnuz,ck,71,0,500.5056,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.93,245.12,0.0 -80,10240,6144,1536,torch.float8_e4m3fnuz,ck,71,0,617.2362,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,313.13,244.63,0.0 -80,12288,6144,1536,torch.float8_e4m3fnuz,ck,71,0,738.7716,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,313.94,242.71,0.0 -80,14336,6144,1536,torch.float8_e4m3fnuz,ck,71,0,860.2231,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.55,241.35,0.0 -80,16384,6144,1536,torch.float8_e4m3fnuz,ck,71,0,983.6366,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.38,239.85,0.0 -80,32768,6144,1536,torch.float8_e4m3fnuz,ck,71,0,1953.3257,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.63,236.74,0.0 -80,65536,6144,1536,torch.float8_e4m3fnuz,ck,71,0,3873.7845,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,319.31,236.31,0.0 -80,98304,6144,1536,torch.float8_e4m3fnuz,ck,71,0,5774.5261,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.31,236.97,0.0 -80,1,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.4606,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,5.68,286.29,0.0 -80,1,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6245,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.31,655.15,0.0 -80,1,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.6648,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2.2,1103.62,0.0 -80,1,7168,2048,torch.float8_e4m3fnuz,ck,10,0,9.1526,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,1,7168,2304,torch.float8_e4m3fnuz,ck,108,0,9.2341,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.58,1790.29,0.0 -80,1,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.7478,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.0,2500.77,0.0 -80,1,7168,4608,torch.float8_e4m3fnuz,ck,24,0,13.6306,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,4.85,2424.62,0.0 -80,1,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.2876,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.79,2895.5,0.0 -80,1,7168,9216,torch.float8_e4m3fnuz,ck,10,0,22.841,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.78,2893.21,0.0 -80,1,7168,16384,torch.float8_e4m3fnuz,ck,24,0,39.678,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,1,7168,18432,torch.float8_e4m3fnuz,ck,10,0,42.556,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.3266,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,2,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6086,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.62,659.65,0.0 -80,2,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.7221,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,4.37,1096.5,0.0 -80,2,7168,2048,torch.float8_e4m3fnuz,ck,5,0,8.5166,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,7168,2304,torch.float8_e4m3fnuz,ck,29,0,9.905,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,2,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.8135,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.94,2488.42,0.0 -80,2,7168,4608,torch.float8_e4m3fnuz,ck,10,0,13.4089,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.85,2466.13,0.0 -80,2,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.503,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,11.46,2866.18,0.0 -80,2,7168,9216,torch.float8_e4m3fnuz,ck,10,0,22.8918,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,11.54,2887.82,0.0 -80,2,7168,16384,torch.float8_e4m3fnuz,ck,10,0,37.3783,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,7168,18432,torch.float8_e4m3fnuz,ck,10,0,42.2924,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,7168,256,torch.float8_e4m3fnuz,ck,73,0,6.9558,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,4,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6728,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.18,657.42,0.0 -80,4,7168,1024,torch.float8_e4m3fnuz,ck,108,0,6.7173,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,8.74,1101.85,0.0 -80,4,7168,2048,torch.float8_e4m3fnuz,ck,108,0,8.5722,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,7168,2304,torch.float8_e4m3fnuz,ck,108,0,9.9551,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.9762,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.61,2457.7,0.0 -80,4,7168,4608,torch.float8_e4m3fnuz,ck,24,0,13.4779,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,19.61,2456.31,0.0 -80,4,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.5581,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,22.85,2860.69,0.0 -80,4,7168,9216,torch.float8_e4m3fnuz,ck,10,0,23.1344,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,22.84,2859.57,0.0 -80,4,7168,16384,torch.float8_e4m3fnuz,ck,10,0,37.6428,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,7168,18432,torch.float8_e4m3fnuz,ck,10,0,42.1052,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.2842,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,8,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6701,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.36,668.21,0.0 -80,8,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.6192,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,17.74,1127.46,0.0 -80,8,7168,2048,torch.float8_e4m3fnuz,ck,24,0,8.685,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,8,7168,2304,torch.float8_e4m3fnuz,ck,108,0,10.0038,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.8869,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,39.52,2482.36,0.0 -80,8,7168,4608,torch.float8_e4m3fnuz,ck,24,0,13.8728,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,38.09,2391.85,0.0 -80,8,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.8381,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,45.09,2826.58,0.0 -80,8,7168,9216,torch.float8_e4m3fnuz,ck,24,0,24.049,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,43.95,2754.74,0.0 -80,8,7168,16384,torch.float8_e4m3fnuz,ck,10,0,39.2255,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,7168,18432,torch.float8_e4m3fnuz,ck,24,0,45.0508,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,16,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.629,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,88.58,312.03,0.0 -80,16,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.4877,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,21.4,712.06,0.0 -80,16,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.8597,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,34.24,1105.85,0.0 -80,16,7168,2048,torch.float8_e4m3fnuz,ck,10,0,8.3657,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,56.15,1786.13,0.0 -80,16,7168,2304,torch.float8_e4m3fnuz,ck,108,0,9.9995,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,52.85,1678.22,0.0 -80,16,7168,4096,torch.float8_e4m3fnuz,ck,10,0,12.6967,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,74.0,2335.65,0.0 -80,16,7168,4608,torch.float8_e4m3fnuz,ck,20,0,15.3599,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,68.81,2170.15,0.0 -80,16,7168,8192,torch.float8_e4m3fnuz,ck,10,0,22.3668,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,84.01,2641.45,0.0 -80,16,7168,9216,torch.float8_e4m3fnuz,ck,6,0,24.8784,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,84.97,2670.47,0.0 -80,16,7168,16384,torch.float8_e4m3fnuz,ck,6,0,42.1056,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,16,7168,18432,torch.float8_e4m3fnuz,ck,6,0,45.8424,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,32,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.8302,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,171.94,337.03,0.0 -80,32,7168,512,torch.float8_e4m3fnuz,ck,76,0,6.5317,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,35.96,634.62,0.0 -80,32,7168,1024,torch.float8_e4m3fnuz,ck,112,0,8.1404,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,57.71,962.06,0.0 -80,32,7168,2048,torch.float8_e4m3fnuz,ck,119,0,10.94,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,85.88,1389.79,0.0 -80,32,7168,2304,torch.float8_e4m3fnuz,ck,119,0,11.6614,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,906.38,1461.88,0.0 -80,32,7168,4096,torch.float8_e4m3fnuz,ck,119,0,17.0986,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,109.89,1751.6,0.0 -80,32,7168,4608,torch.float8_e4m3fnuz,ck,119,0,17.923,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,117.95,1876.71,0.0 -80,32,7168,8192,torch.float8_e4m3fnuz,ck,119,0,29.6463,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,126.76,2005.01,0.0 -80,32,7168,9216,torch.float8_e4m3fnuz,ck,119,0,31.763,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,133.11,2103.52,0.0 -80,32,7168,16384,torch.float8_e4m3fnuz,ck,119,0,53.2367,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1411.84,2224.47,0.0 -80,32,7168,18432,torch.float8_e4m3fnuz,ck,119,0,58.5048,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,48,7168,256,torch.float8_e4m3fnuz,ck,75,0,7.0126,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,251.21,361.55,0.0 -80,48,7168,2304,torch.float8_e4m3fnuz,ck,113,0,13.6727,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1159.57,1266.3,0.0 -80,64,7168,256,torch.float8_e4m3fnuz,ck,75,0,7.6886,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,305.49,360.13,0.0 -80,64,7168,512,torch.float8_e4m3fnuz,ck,76,0,8.1001,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,57.99,570.4,0.0 -80,64,7168,1024,torch.float8_e4m3fnuz,ck,112,0,10.2444,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,91.71,812.45,0.0 -80,64,7168,2048,torch.float8_e4m3fnuz,ck,112,0,14.2819,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1315.69,1101.3,0.0 -80,64,7168,2304,torch.float8_e4m3fnuz,ck,112,0,15.5151,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1362.5,1133.09,0.0 -80,64,7168,4096,torch.float8_e4m3fnuz,ck,112,0,24.3757,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,154.17,1252.88,0.0 -80,64,7168,4608,torch.float8_e4m3fnuz,ck,112,0,27.0894,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,156.07,1264.06,0.0 -80,64,7168,8192,torch.float8_e4m3fnuz,ck,112,0,43.5233,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,172.69,1382.3,0.0 -80,64,7168,9216,torch.float8_e4m3fnuz,ck,121,0,48.7661,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,173.39,1385.54,0.0 -80,64,7168,16384,torch.float8_e4m3fnuz,ck,121,0,81.4496,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1845.61,1466.02,0.0 -80,64,7168,18432,torch.float8_e4m3fnuz,ck,121,0,89.7754,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,80,7168,256,torch.float8_e4m3fnuz,ck,75,0,8.2998,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,353.75,361.74,0.0 -80,80,7168,2304,torch.float8_e4m3fnuz,ck,113,0,18.9055,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1397.69,943.97,0.0 -80,96,7168,256,torch.float8_e4m3fnuz,ck,75,0,8.6194,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,408.75,375.41,0.0 -80,96,7168,512,torch.float8_e4m3fnuz,ck,76,0,9.4912,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,74.24,536.86,0.0 -80,96,7168,1024,torch.float8_e4m3fnuz,ck,113,0,12.4833,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,112.89,706.11,0.0 -80,96,7168,2048,torch.float8_e4m3fnuz,ck,113,0,17.6925,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1593.09,918.63,0.0 -80,96,7168,2304,torch.float8_e4m3fnuz,ck,84,0,19.1495,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1655.86,945.85,0.0 -80,96,7168,4096,torch.float8_e4m3fnuz,ck,113,0,29.9117,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,188.46,1040.72,0.0 -80,96,7168,4608,torch.float8_e4m3fnuz,ck,113,0,33.003,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,192.16,1055.93,0.0 -80,96,7168,8192,torch.float8_e4m3fnuz,ck,113,0,53.0736,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,212.43,1147.14,0.0 -80,96,7168,9216,torch.float8_e4m3fnuz,ck,113,0,59.3348,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,213.76,1151.45,0.0 -80,96,7168,16384,torch.float8_e4m3fnuz,ck,113,0,98.4525,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2290.3,1222.82,0.0 -80,96,7168,18432,torch.float8_e4m3fnuz,ck,113,0,109.8187,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,112,7168,256,torch.float8_e4m3fnuz,ck,75,0,9.207,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,446.44,376.81,0.0 -80,112,7168,2304,torch.float8_e4m3fnuz,ck,119,0,21.7835,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1698.25,843.7,0.0 -80,128,7168,256,torch.float8_e4m3fnuz,ck,73,0,10.1406,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,463.25,365.14,0.0 -80,128,7168,512,torch.float8_e4m3fnuz,ck,84,0,10.7725,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,87.22,517.11,0.0 -80,128,7168,1024,torch.float8_e4m3fnuz,ck,119,0,14.7575,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,127.33,630.6,0.0 -80,128,7168,2048,torch.float8_e4m3fnuz,ck,119,0,20.5111,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1832.23,817.96,0.0 -80,128,7168,2304,torch.float8_e4m3fnuz,ck,84,0,21.9275,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1928.11,850.3,0.0 -80,128,7168,4096,torch.float8_e4m3fnuz,ck,119,0,35.2197,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,213.41,900.62,0.0 -80,128,7168,4608,torch.float8_e4m3fnuz,ck,119,0,39.2577,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,215.39,903.13,0.0 -80,128,7168,8192,torch.float8_e4m3fnuz,ck,119,0,63.878,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,235.33,964.4,0.0 -80,128,7168,9216,torch.float8_e4m3fnuz,ck,119,0,71.9358,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,235.09,960.23,0.0 -80,128,7168,16384,torch.float8_e4m3fnuz,ck,114,0,119.9291,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2506.88,1012.04,0.0 -80,128,7168,18432,torch.float8_e4m3fnuz,ck,114,0,133.6804,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,160,7168,256,torch.float8_e4m3fnuz,ck,75,0,10.251,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,160,7168,512,torch.float8_e4m3fnuz,ck,76,0,12.0125,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,97.77,503.28,0.0 -80,160,7168,1024,torch.float8_e4m3fnuz,ck,112,0,17.3598,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,135.3,564.39,0.0 -80,160,7168,2048,torch.float8_e4m3fnuz,ck,119,0,25.3574,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1852.56,682.31,0.0 -80,160,7168,2304,torch.float8_e4m3fnuz,ck,119,0,27.2747,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,160,7168,4096,torch.float8_e4m3fnuz,ck,133,0,44.638,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,210.48,723.81,0.0 -80,160,7168,4608,torch.float8_e4m3fnuz,ck,133,0,49.6981,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,212.68,725.6,0.0 -80,160,7168,8192,torch.float8_e4m3fnuz,ck,133,0,81.2116,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,231.38,767.44,0.0 -80,160,7168,9216,torch.float8_e4m3fnuz,ck,133,0,90.3778,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,233.9,772.63,0.0 -80,160,7168,16384,torch.float8_e4m3fnuz,ck,136,0,150.746,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2493.0,811.67,0.0 -80,160,7168,18432,torch.float8_e4m3fnuz,ck,136,0,168.4653,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,192,7168,256,torch.float8_e4m3fnuz,ck,73,0,10.5659,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,192,7168,512,torch.float8_e4m3fnuz,ck,84,0,13.4126,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,105.07,486.17,0.0 -80,192,7168,1024,torch.float8_e4m3fnuz,ck,84,0,19.8942,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,141.68,517.19,0.0 -80,192,7168,2048,torch.float8_e4m3fnuz,ck,120,0,28.1354,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2003.58,633.57,0.0 -80,192,7168,2304,torch.float8_e4m3fnuz,ck,120,0,29.5015,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,192,7168,4096,torch.float8_e4m3fnuz,ck,120,0,48.3634,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,233.12,680.25,0.0 -80,192,7168,4608,torch.float8_e4m3fnuz,ck,120,0,53.0886,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,238.91,690.68,0.0 -80,192,7168,8192,torch.float8_e4m3fnuz,ck,120,0,87.1718,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,258.67,723.23,0.0 -80,192,7168,9216,torch.float8_e4m3fnuz,ck,120,0,96.3409,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,263.31,732.63,0.0 -80,192,7168,16384,torch.float8_e4m3fnuz,ck,120,0,162.2601,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2779.31,760.13,0.0 -80,192,7168,18432,torch.float8_e4m3fnuz,ck,120,0,181.5802,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,224,7168,256,torch.float8_e4m3fnuz,ck,73,0,10.9003,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,224,7168,512,torch.float8_e4m3fnuz,ck,84,0,14.9991,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,109.62,466.43,0.0 -80,224,7168,1024,torch.float8_e4m3fnuz,ck,84,0,20.7816,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,158.23,518.76,0.0 -80,224,7168,2048,torch.float8_e4m3fnuz,ck,85,0,30.5218,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2154.74,601.21,0.0 -80,224,7168,2304,torch.float8_e4m3fnuz,ck,85,0,32.8835,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,224,7168,4096,torch.float8_e4m3fnuz,ck,85,0,54.7818,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,240.1,611.31,0.0 -80,224,7168,4608,torch.float8_e4m3fnuz,ck,85,0,60.3092,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,245.36,618.04,0.0 -80,224,7168,8192,torch.float8_e4m3fnuz,ck,85,0,98.7952,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,266.27,645.44,0.0 -80,224,7168,9216,torch.float8_e4m3fnuz,ck,85,0,110.1364,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,268.71,647.71,0.0 -80,224,7168,16384,torch.float8_e4m3fnuz,ck,85,0,185.5598,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2835.39,669.98,0.0 -80,224,7168,18432,torch.float8_e4m3fnuz,ck,85,0,207.6011,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,256,7168,256,torch.float8_e4m3fnuz,ck,73,0,11.5078,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,816.42,484.07,0.0 -80,256,7168,512,torch.float8_e4m3fnuz,ck,85,0,16.1374,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,116.44,462.97,0.0 -80,256,7168,1024,torch.float8_e4m3fnuz,ck,85,0,21.584,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,174.11,522.25,0.0 -80,256,7168,2048,torch.float8_e4m3fnuz,ck,85,0,30.5235,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2462.43,618.36,0.0 -80,256,7168,2304,torch.float8_e4m3fnuz,ck,85,0,32.0436,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2638.82,648.33,0.0 -80,256,7168,4096,torch.float8_e4m3fnuz,ck,85,0,54.0567,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,278.09,630.43,0.0 -80,256,7168,4608,torch.float8_e4m3fnuz,ck,85,0,60.5172,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.45,625.93,0.0 -80,256,7168,8192,torch.float8_e4m3fnuz,ck,85,0,98.4777,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,305.3,654.84,0.0 -80,256,7168,9216,torch.float8_e4m3fnuz,ck,85,0,109.9894,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,307.51,655.42,0.0 -80,256,7168,16384,torch.float8_e4m3fnuz,ck,85,0,186.0958,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3231.11,673.34,0.0 -80,256,7168,18432,torch.float8_e4m3fnuz,ck,85,0,209.5107,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,288,7168,256,torch.float8_e4m3fnuz,ck,72,0,12.7115,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,288,7168,512,torch.float8_e4m3fnuz,ck,76,0,17.9905,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,117.5,441.69,0.0 -80,288,7168,1024,torch.float8_e4m3fnuz,ck,84,0,26.2028,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,161.35,448.95,0.0 -80,288,7168,2048,torch.float8_e4m3fnuz,ck,85,0,38.1402,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2217.01,508.61,0.0 -80,288,7168,2304,torch.float8_e4m3fnuz,ck,85,0,40.9768,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,288,7168,4096,torch.float8_e4m3fnuz,ck,85,0,69.139,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.6,501.43,0.0 -80,288,7168,4608,torch.float8_e4m3fnuz,ck,85,0,76.8637,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,247.52,500.7,0.0 -80,288,7168,8192,torch.float8_e4m3fnuz,ck,85,0,127.535,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,265.2,511.3,0.0 -80,288,7168,9216,torch.float8_e4m3fnuz,ck,85,0,142.1807,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,267.62,512.33,0.0 -80,288,7168,16384,torch.float8_e4m3fnuz,ck,85,0,243.1421,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2782.15,519.4,0.0 -80,288,7168,18432,torch.float8_e4m3fnuz,ck,85,0,272.1414,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,7168,256,torch.float8_e4m3fnuz,ck,74,0,13.0231,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,320,7168,512,torch.float8_e4m3fnuz,ck,84,0,19.0693,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,123.17,441.62,0.0 -80,320,7168,1024,torch.float8_e4m3fnuz,ck,85,0,26.7558,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,175.57,458.04,0.0 -80,320,7168,2048,torch.float8_e4m3fnuz,ck,85,0,38.8094,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2420.87,513.35,0.0 -80,320,7168,2304,torch.float8_e4m3fnuz,ck,85,0,40.2392,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,7168,4096,torch.float8_e4m3fnuz,ck,85,0,70.3183,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,267.22,501.41,0.0 -80,320,7168,4608,torch.float8_e4m3fnuz,ck,85,0,79.5688,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,265.67,491.3,0.0 -80,320,7168,8192,torch.float8_e4m3fnuz,ck,85,0,131.2114,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,286.42,502.47,0.0 -80,320,7168,9216,torch.float8_e4m3fnuz,ck,85,0,146.2403,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,289.1,503.26,0.0 -80,320,7168,16384,torch.float8_e4m3fnuz,ck,101,0,239.9889,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3131.89,530.32,0.0 -80,320,7168,18432,torch.float8_e4m3fnuz,ck,101,0,270.4958,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,352,7168,256,torch.float8_e4m3fnuz,ck,75,0,15.1695,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,352,7168,512,torch.float8_e4m3fnuz,ck,84,0,20.5103,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,125.97,433.76,0.0 -80,352,7168,1024,torch.float8_e4m3fnuz,ck,84,0,30.2515,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,170.81,421.36,0.0 -80,352,7168,2048,torch.float8_e4m3fnuz,ck,100,0,45.5887,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,2266.96,448.52,0.0 -80,352,7168,2304,torch.float8_e4m3fnuz,ck,100,0,49.1372,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,352,7168,4096,torch.float8_e4m3fnuz,ck,133,0,83.0217,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,248.97,431.79,0.0 -80,352,7168,4608,torch.float8_e4m3fnuz,ck,133,0,92.2196,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,252.15,430.48,0.0 -80,352,7168,8192,torch.float8_e4m3fnuz,ck,86,0,154.334,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,267.85,431.86,0.0 -80,352,7168,9216,torch.float8_e4m3fnuz,ck,133,0,171.5389,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,271.11,433.43,0.0 -80,352,7168,16384,torch.float8_e4m3fnuz,ck,86,0,289.7112,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2853.81,442.7,0.0 -80,352,7168,18432,torch.float8_e4m3fnuz,ck,86,0,324.6944,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,7168,256,torch.float8_e4m3fnuz,ck,72,0,15.1515,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,384,7168,512,torch.float8_e4m3fnuz,ck,84,0,21.3082,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,132.28,439.81,0.0 -80,384,7168,1024,torch.float8_e4m3fnuz,ck,84,0,31.9035,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,176.69,414.95,0.0 -80,384,7168,2048,torch.float8_e4m3fnuz,ck,86,0,45.0074,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2504.99,465.96,0.0 -80,384,7168,2304,torch.float8_e4m3fnuz,ck,86,0,48.0036,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,7168,4096,torch.float8_e4m3fnuz,ck,86,0,80.811,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.03,450.9,0.0 -80,384,7168,4608,torch.float8_e4m3fnuz,ck,86,0,88.8226,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.59,453.77,0.0 -80,384,7168,8192,torch.float8_e4m3fnuz,ck,86,0,144.2396,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,312.65,467.08,0.0 -80,384,7168,9216,torch.float8_e4m3fnuz,ck,86,0,160.4438,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.21,468.1,0.0 -80,384,7168,16384,torch.float8_e4m3fnuz,ck,86,0,268.8438,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3354.9,480.71,0.0 -80,384,7168,18432,torch.float8_e4m3fnuz,ck,86,0,300.0663,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,7168,256,torch.float8_e4m3fnuz,ck,74,0,15.9059,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,416,7168,512,torch.float8_e4m3fnuz,ck,84,0,22.965,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,132.96,428.77,0.0 -80,416,7168,1024,torch.float8_e4m3fnuz,ck,85,0,32.8392,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,185.96,418.09,0.0 -80,416,7168,2048,torch.float8_e4m3fnuz,ck,85,0,48.1552,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,7168,2304,torch.float8_e4m3fnuz,ck,85,0,51.2564,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,7168,4096,torch.float8_e4m3fnuz,ck,85,0,88.4594,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,276.15,418.59,0.0 -80,416,7168,4608,torch.float8_e4m3fnuz,ck,85,0,99.396,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,276.48,411.59,0.0 -80,416,7168,8192,torch.float8_e4m3fnuz,ck,85,0,163.693,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,298.46,415.97,0.0 -80,416,7168,9216,torch.float8_e4m3fnuz,ck,85,0,183.5982,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,299.36,413.17,0.0 -80,416,7168,16384,torch.float8_e4m3fnuz,ck,85,0,310.2738,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,7168,18432,torch.float8_e4m3fnuz,ck,85,0,352.9218,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,7168,256,torch.float8_e4m3fnuz,ck,72,0,15.6539,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,448,7168,512,torch.float8_e4m3fnuz,ck,84,0,23.4146,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,140.44,440.83,0.0 -80,448,7168,1024,torch.float8_e4m3fnuz,ck,85,0,32.9498,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,199.6,431.61,0.0 -80,448,7168,2048,torch.float8_e4m3fnuz,ck,85,0,47.8488,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,7168,2304,torch.float8_e4m3fnuz,ck,85,0,49.72,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,7168,4096,torch.float8_e4m3fnuz,ck,85,0,89.2889,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,294.62,421.3,0.0 -80,448,7168,4608,torch.float8_e4m3fnuz,ck,85,0,99.4928,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,297.46,417.29,0.0 -80,448,7168,8192,torch.float8_e4m3fnuz,ck,85,0,164.1085,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.6,419.31,0.0 -80,448,7168,9216,torch.float8_e4m3fnuz,ck,85,0,183.9126,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.84,416.57,0.0 -80,448,7168,16384,torch.float8_e4m3fnuz,ck,85,0,316.355,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,7168,18432,torch.float8_e4m3fnuz,ck,85,0,356.7658,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,7168,256,torch.float8_e4m3fnuz,ck,72,0,16.1387,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,480,7168,512,torch.float8_e4m3fnuz,ck,72,0,25.9264,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,135.89,416.45,0.0 -80,480,7168,1024,torch.float8_e4m3fnuz,ck,72,0,36.9129,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,190.89,398.58,0.0 -80,480,7168,2048,torch.float8_e4m3fnuz,ck,102,0,52.4824,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,7168,2304,torch.float8_e4m3fnuz,ck,102,0,56.9516,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,7168,4096,torch.float8_e4m3fnuz,ck,102,0,96.7743,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,291.25,394.81,0.0 -80,480,7168,4608,torch.float8_e4m3fnuz,ck,102,0,106.4225,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,297.95,395.81,0.0 -80,480,7168,8192,torch.float8_e4m3fnuz,ck,102,0,176.1263,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.06,394.79,0.0 -80,480,7168,9216,torch.float8_e4m3fnuz,ck,102,0,194.8844,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,325.41,396.98,0.0 -80,480,7168,16384,torch.float8_e4m3fnuz,ck,86,0,346.0471,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,7168,18432,torch.float8_e4m3fnuz,ck,102,0,379.6631,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,7168,256,torch.float8_e4m3fnuz,ck,72,0,16.1951,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1160.26,574.63,0.0 -80,512,7168,512,torch.float8_e4m3fnuz,ck,72,0,26.2156,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,143.35,429.98,0.0 -80,512,7168,1024,torch.float8_e4m3fnuz,ck,71,0,37.0911,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,202.64,409.92,0.0 -80,512,7168,2048,torch.float8_e4m3fnuz,ck,71,0,53.438,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2813.05,431.69,0.0 -80,512,7168,2304,torch.float8_e4m3fnuz,ck,72,0,57.4769,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2942.3,435.56,0.0 -80,512,7168,4096,torch.float8_e4m3fnuz,ck,71,0,99.0962,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,303.39,391.51,0.0 -80,512,7168,4608,torch.float8_e4m3fnuz,ck,71,0,109.7596,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.15,389.3,0.0 -80,512,7168,8192,torch.float8_e4m3fnuz,ck,71,0,182.1732,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,330.07,385.65,0.0 -80,512,7168,9216,torch.float8_e4m3fnuz,ck,71,0,204.5549,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,330.7,381.9,0.0 -80,512,7168,16384,torch.float8_e4m3fnuz,ck,71,0,346.2792,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,7168,18432,torch.float8_e4m3fnuz,ck,70,0,389.6687,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1024,7168,256,torch.float8_e4m3fnuz,ck,71,0,27.7779,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1352.91,603.98,0.0 -80,1024,7168,512,torch.float8_e4m3fnuz,ck,71,0,46.6251,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,161.2,404.81,0.0 -80,1024,7168,1024,torch.float8_e4m3fnuz,ck,71,0,66.699,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,225.38,345.86,0.0 -80,1024,7168,2048,torch.float8_e4m3fnuz,ck,71,0,98.0567,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3066.06,320.81,0.0 -80,1024,7168,2304,torch.float8_e4m3fnuz,ck,85,0,107.3839,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3149.71,312.47,0.0 -80,1024,7168,4096,torch.float8_e4m3fnuz,ck,71,0,186.692,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.08,258.36,0.0 -80,1024,7168,4608,torch.float8_e4m3fnuz,ck,71,0,208.1557,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,324.98,251.87,0.0 -80,1024,7168,8192,torch.float8_e4m3fnuz,ck,85,0,349.1505,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,344.43,234.25,0.0 -80,1024,7168,9216,torch.float8_e4m3fnuz,ck,85,0,390.4976,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,346.46,230.93,0.0 -80,1024,7168,16384,torch.float8_e4m3fnuz,ck,85,0,661.406,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1024,7168,18432,torch.float8_e4m3fnuz,ck,85,0,745.9611,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1536,7168,256,torch.float8_e4m3fnuz,ck,71,0,37.3284,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1510.15,649.59,0.0 -80,1536,7168,2048,torch.float8_e4m3fnuz,ck,85,0,138.0452,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3266.84,288.64,0.0 -80,1536,7168,2304,torch.float8_e4m3fnuz,ck,85,0,150.2725,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3376.15,279.99,0.0 -80,2048,7168,256,torch.float8_e4m3fnuz,ck,71,0,48.0096,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1565.56,660.69,0.0 -80,2048,7168,512,torch.float8_e4m3fnuz,ck,71,0,82.6364,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,181.91,412.39,0.0 -80,2048,7168,1024,torch.float8_e4m3fnuz,ck,85,0,122.5939,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,245.24,316.47,0.0 -80,2048,7168,2048,torch.float8_e4m3fnuz,ck,85,0,186.7051,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3220.56,258.35,0.0 -80,2048,7168,2304,torch.float8_e4m3fnuz,ck,85,0,199.0359,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3398.67,254.19,0.0 -80,2048,7168,4096,torch.float8_e4m3fnuz,ck,85,0,351.1592,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,342.46,191.11,0.0 -80,2048,7168,4608,torch.float8_e4m3fnuz,ck,85,0,390.2145,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,346.71,184.07,0.0 -80,2048,7168,8192,torch.float8_e4m3fnuz,ck,85,0,657.8671,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.6,159.39,0.0 -80,2048,7168,9216,torch.float8_e4m3fnuz,ck,85,0,738.9236,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,366.19,154.68,0.0 -80,2048,7168,16384,torch.float8_e4m3fnuz,ck,85,0,1256.6524,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,2048,7168,18432,torch.float8_e4m3fnuz,ck,85,0,1411.2909,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,7168,256,torch.float8_e4m3fnuz,ck,71,0,85.7258,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1753.54,718.61,0.0 -80,4096,7168,512,torch.float8_e4m3fnuz,ck,71,0,151.6161,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,198.3,425.33,0.0 -80,4096,7168,1024,torch.float8_e4m3fnuz,ck,71,0,228.1081,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,263.6,307.99,0.0 -80,4096,7168,2048,torch.float8_e4m3fnuz,ck,71,0,345.3094,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3482.65,236.86,0.0 -80,4096,7168,2304,torch.float8_e4m3fnuz,ck,71,0,379.4759,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3565.22,223.13,0.0 -80,4096,7168,4096,torch.float8_e4m3fnuz,ck,85,0,678.039,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,354.73,154.65,0.0 -80,4096,7168,4608,torch.float8_e4m3fnuz,ck,85,0,756.3913,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,357.73,146.25,0.0 -80,4096,7168,8192,torch.float8_e4m3fnuz,ck,85,0,1274.608,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.4,118.46,0.0 -80,4096,7168,9216,torch.float8_e4m3fnuz,ck,85,0,1433.1369,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.61,113.41,0.0 -80,4096,7168,16384,torch.float8_e4m3fnuz,ck,85,0,2443.998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,7168,18432,torch.float8_e4m3fnuz,ck,85,0,2773.2274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,7168,256,torch.float8_e4m3fnuz,ck,71,0,123.4161,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,7168,512,torch.float8_e4m3fnuz,ck,71,0,215.2661,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,209.49,440.83,0.0 -80,6144,7168,1024,torch.float8_e4m3fnuz,ck,71,0,327.7242,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,275.21,310.36,0.0 -80,6144,7168,2048,torch.float8_e4m3fnuz,ck,71,0,509.0789,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,7168,2304,torch.float8_e4m3fnuz,ck,71,0,560.4929,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,7168,4096,torch.float8_e4m3fnuz,ck,71,0,996.6903,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,361.98,143.08,0.0 -80,6144,7168,4608,torch.float8_e4m3fnuz,ck,71,0,1121.7892,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,361.81,133.2,0.0 -80,6144,7168,8192,torch.float8_e4m3fnuz,ck,71,0,1890.6064,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.65,104.27,0.0 -80,6144,7168,9216,torch.float8_e4m3fnuz,ck,71,0,2131.2878,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.87,98.89,0.0 -80,6144,7168,16384,torch.float8_e4m3fnuz,ck,85,0,3671.969,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,7168,18432,torch.float8_e4m3fnuz,ck,85,0,4139.4899,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,7168,256,torch.float8_e4m3fnuz,ck,71,0,158.8437,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1892.73,764.1,0.0 -80,8192,7168,512,torch.float8_e4m3fnuz,ck,71,0,284.9791,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,211.0,439.7,0.0 -80,8192,7168,1024,torch.float8_e4m3fnuz,ck,71,0,434.723,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,276.63,306.33,0.0 -80,8192,7168,2048,torch.float8_e4m3fnuz,ck,71,0,665.8308,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3612.3,223.63,0.0 -80,8192,7168,2304,torch.float8_e4m3fnuz,ck,71,0,736.4592,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3674.11,207.52,0.0 -80,8192,7168,4096,torch.float8_e4m3fnuz,ck,71,0,1320.5151,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.28,136.58,0.0 -80,8192,7168,4608,torch.float8_e4m3fnuz,ck,71,0,1479.7426,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.72,127.2,0.0 -80,8192,7168,8192,torch.float8_e4m3fnuz,ck,71,0,2510.9482,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,383.15,96.88,0.0 -80,8192,7168,9216,torch.float8_e4m3fnuz,ck,71,0,2849.736,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.8,90.89,0.0 -80,8192,7168,16384,torch.float8_e4m3fnuz,ck,85,0,4942.7227,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,7168,18432,torch.float8_e4m3fnuz,ck,85,0,5501.8021,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,7168,256,torch.float8_e4m3fnuz,ck,71,0,199.187,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,7168,512,torch.float8_e4m3fnuz,ck,71,0,352.3613,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,213.31,441.91,0.0 -80,10240,7168,1024,torch.float8_e4m3fnuz,ck,71,0,534.3848,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,281.3,308.07,0.0 -80,10240,7168,2048,torch.float8_e4m3fnuz,ck,71,0,843.0746,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,7168,2304,torch.float8_e4m3fnuz,ck,71,0,913.7132,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,7168,4096,torch.float8_e4m3fnuz,ck,71,0,1650.4804,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.32,132.15,0.0 -80,10240,7168,4608,torch.float8_e4m3fnuz,ck,71,0,1837.7562,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,368.09,123.53,0.0 -80,10240,7168,8192,torch.float8_e4m3fnuz,ck,85,0,3170.8672,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.26,91.27,0.0 -80,10240,7168,9216,torch.float8_e4m3fnuz,ck,85,0,3555.6005,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.5,86.41,0.0 -80,10240,7168,16384,torch.float8_e4m3fnuz,ck,85,0,6072.4257,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,7168,18432,torch.float8_e4m3fnuz,ck,85,0,6834.6666,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,7168,256,torch.float8_e4m3fnuz,ck,71,0,243.398,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,7168,512,torch.float8_e4m3fnuz,ck,71,0,420.7005,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.39,442.41,0.0 -80,12288,7168,1024,torch.float8_e4m3fnuz,ck,71,0,642.9593,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,280.56,304.97,0.0 -80,12288,7168,2048,torch.float8_e4m3fnuz,ck,71,0,1001.5776,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,7168,2304,torch.float8_e4m3fnuz,ck,72,0,1134.7441,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,12288,7168,4096,torch.float8_e4m3fnuz,ck,71,0,1986.0405,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,363.31,128.83,0.0 -80,12288,7168,4608,torch.float8_e4m3fnuz,ck,102,0,2231.5819,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,363.75,119.11,0.0 -80,12288,7168,8192,torch.float8_e4m3fnuz,ck,102,0,3769.9628,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,382.79,89.0,0.0 -80,12288,7168,9216,torch.float8_e4m3fnuz,ck,102,0,4210.5442,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,385.58,84.42,0.0 -80,12288,7168,16384,torch.float8_e4m3fnuz,ck,102,0,7272.8109,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,7168,18432,torch.float8_e4m3fnuz,ck,102,0,8054.046,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,7168,256,torch.float8_e4m3fnuz,ck,72,0,297.7928,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,14336,7168,512,torch.float8_e4m3fnuz,ck,71,0,486.4844,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,216.3,445.09,0.0 -80,14336,7168,1024,torch.float8_e4m3fnuz,ck,71,0,743.1281,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,283.2,306.19,0.0 -80,14336,7168,2048,torch.float8_e4m3fnuz,ck,74,0,1199.9511,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,14336,7168,2304,torch.float8_e4m3fnuz,ck,71,0,1310.3925,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,7168,4096,torch.float8_e4m3fnuz,ck,71,0,2319.2645,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,362.97,126.59,0.0 -80,14336,7168,4608,torch.float8_e4m3fnuz,ck,71,0,2586.1715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,366.19,117.78,0.0 -80,14336,7168,8192,torch.float8_e4m3fnuz,ck,85,0,4407.5338,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.99,86.6,0.0 -80,14336,7168,9216,torch.float8_e4m3fnuz,ck,85,0,4962.8544,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.65,81.34,0.0 -80,14336,7168,16384,torch.float8_e4m3fnuz,ck,85,0,8455.8581,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,7168,18432,torch.float8_e4m3fnuz,ck,85,0,9585.3754,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,7168,256,torch.float8_e4m3fnuz,ck,71,0,306.2344,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1963.51,786.69,0.0 -80,16384,7168,512,torch.float8_e4m3fnuz,ck,71,0,558.1976,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.44,442.39,0.0 -80,16384,7168,1024,torch.float8_e4m3fnuz,ck,71,0,852.9241,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,281.99,303.66,0.0 -80,16384,7168,2048,torch.float8_e4m3fnuz,ck,71,0,1322.9659,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3636.04,214.0,0.0 -80,16384,7168,2304,torch.float8_e4m3fnuz,ck,71,0,1464.8949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3694.23,197.38,0.0 -80,16384,7168,4096,torch.float8_e4m3fnuz,ck,71,0,2629.6553,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.86,126.01,0.0 -80,16384,7168,4608,torch.float8_e4m3fnuz,ck,71,0,2948.8372,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,367.04,116.46,0.0 -80,16384,7168,8192,torch.float8_e4m3fnuz,ck,71,0,5047.6593,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.2,84.76,0.0 -80,16384,7168,9216,torch.float8_e4m3fnuz,ck,85,0,5684.8825,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.78,79.5,0.0 -80,16384,7168,16384,torch.float8_e4m3fnuz,ck,85,0,9679.208,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,7168,18432,torch.float8_e4m3fnuz,ck,85,0,10922.6859,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,20480,7168,256,torch.float8_e4m3fnuz,ck,71,0,381.5807,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1969.75,787.98,0.0 -80,20480,7168,2048,torch.float8_e4m3fnuz,ck,71,0,1639.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3666.64,213.56,0.0 -80,20480,7168,2304,torch.float8_e4m3fnuz,ck,71,0,1805.4245,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3746.8,197.9,0.0 -80,32768,7168,256,torch.float8_e4m3fnuz,ck,71,0,600.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2001.29,798.77,0.0 -80,32768,7168,512,torch.float8_e4m3fnuz,ck,71,0,1096.744,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.3,446.97,0.0 -80,32768,7168,1024,torch.float8_e4m3fnuz,ck,71,0,1682.4154,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.92,303.53,0.0 -80,32768,7168,2048,torch.float8_e4m3fnuz,ck,72,0,2704.1764,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,32768,7168,2304,torch.float8_e4m3fnuz,ck,71,0,2907.7206,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3722.27,193.2,0.0 -80,32768,7168,4096,torch.float8_e4m3fnuz,ck,71,0,5253.8807,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,366.23,120.55,0.0 -80,32768,7168,4608,torch.float8_e4m3fnuz,ck,71,0,5865.1077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.07,111.47,0.0 -80,32768,7168,8192,torch.float8_e4m3fnuz,ck,71,0,10111.3215,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.59,78.81,0.0 -80,32768,7168,9216,torch.float8_e4m3fnuz,ck,85,0,11337.9808,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.84,73.89,0.0 -80,32768,7168,16384,torch.float8_e4m3fnuz,ck,85,0,19294.1288,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,32768,7168,18432,torch.float8_e4m3fnuz,ck,85,0,21856.5731,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,7168,256,torch.float8_e4m3fnuz,ck,71,0,1212.7558,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,7168,512,torch.float8_e4m3fnuz,ck,71,0,2167.8734,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,221.89,450.56,0.0 -80,65536,7168,1024,torch.float8_e4m3fnuz,ck,71,0,3378.6011,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,284.75,300.12,0.0 -80,65536,7168,2048,torch.float8_e4m3fnuz,ck,71,0,5287.9696,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,7168,2304,torch.float8_e4m3fnuz,ck,71,0,5871.5636,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,7168,4096,torch.float8_e4m3fnuz,ck,71,0,10540.0481,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.11,117.39,0.0 -80,65536,7168,4608,torch.float8_e4m3fnuz,ck,71,0,11720.6642,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.38,108.74,0.0 -80,65536,7168,8192,torch.float8_e4m3fnuz,ck,85,0,20110.6143,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,382.71,76.33,0.0 -80,65536,7168,9216,torch.float8_e4m3fnuz,ck,85,0,22625.9524,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,382.69,71.14,0.0 -80,65536,7168,16384,torch.float8_e4m3fnuz,ck,85,0,38748.1812,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,7168,18432,torch.float8_e4m3fnuz,ck,85,0,43467.0784,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,7168,256,torch.float8_e4m3fnuz,ck,72,0,1997.5301,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,98304,7168,512,torch.float8_e4m3fnuz,ck,71,0,3241.0898,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.63,451.48,0.0 -80,98304,7168,1024,torch.float8_e4m3fnuz,ck,71,0,5048.0903,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.87,300.57,0.0 -80,98304,7168,2048,torch.float8_e4m3fnuz,ck,72,0,8086.5215,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,98304,7168,2304,torch.float8_e4m3fnuz,ck,71,0,8879.3992,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,7168,4096,torch.float8_e4m3fnuz,ck,71,0,15801.8859,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.3,116.52,0.0 -80,98304,7168,4608,torch.float8_e4m3fnuz,ck,71,0,17582.8246,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.34,107.79,0.0 -80,98304,7168,8192,torch.float8_e4m3fnuz,ck,102,0,29902.9106,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,386.08,76.02,0.0 -80,98304,7168,9216,torch.float8_e4m3fnuz,ck,102,0,33489.9388,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,387.82,71.11,0.0 -80,98304,7168,16384,torch.float8_e4m3fnuz,ck,102,0,57025.1637,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,7168,18432,torch.float8_e4m3fnuz,ck,102,0,63990.1906,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,7168,256,torch.float8_e4m3fnuz,ck,71,0,2444.3347,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,7168,2048,torch.float8_e4m3fnuz,ck,71,0,10519.7293,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,7168,2304,torch.float8_e4m3fnuz,ck,71,0,11621.1373,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,7168,8192,torch.float8_e4m3fnuz,ck,85,0,40106.8483,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,383.8,75.09,0.0 -80,131072,7168,16384,torch.float8_e4m3fnuz,ck,85,0,77358.0635,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,7168,18432,torch.float8_e4m3fnuz,ck,85,0,87126.4572,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.7689,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.45,729.98,0.0 -80,1,8192,1024,torch.float8_e4m3fnuz,ck,15,0,6.4482,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,26.02,1303.62,0.0 -80,1,8192,1536,torch.float8_e4m3fnuz,ck,11,0,7.6538,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.8077,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.89,728.02,0.0 -80,2,8192,1536,torch.float8_e4m3fnuz,ck,11,0,7.5144,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,6.7,1679.28,0.0 -80,4,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.8024,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.78,734.5,0.0 -80,4,8192,1536,torch.float8_e4m3fnuz,ck,6,0,7.6354,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,13.18,1657.36,0.0 -80,8,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.8225,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,11.53,743.58,0.0 -80,8,8192,1536,torch.float8_e4m3fnuz,ck,108,0,7.9074,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,25.46,1609.41,0.0 -80,16,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.7209,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,23.46,780.41,0.0 -80,16,8192,1536,torch.float8_e4m3fnuz,ck,108,0,7.8973,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,50.99,1629.62,0.0 -80,32,8192,512,torch.float8_e4m3fnuz,ck,76,0,6.5249,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,41.14,725.68,0.0 -80,32,8192,1024,torch.float8_e4m3fnuz,ck,119,0,7.4863,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,717.14,1194.94,0.0 -80,32,8192,1536,torch.float8_e4m3fnuz,ck,112,0,9.686,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,83.14,1358.29,0.0 -80,64,8192,512,torch.float8_e4m3fnuz,ck,76,0,8.7017,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,61.7,606.28,0.0 -80,64,8192,1024,torch.float8_e4m3fnuz,ck,114,0,10.9842,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,977.53,865.13,0.0 -80,64,8192,1536,torch.float8_e4m3fnuz,ck,114,0,12.7978,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,96,8192,512,torch.float8_e4m3fnuz,ck,76,0,9.814,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,82.06,592.66,0.0 -80,96,8192,1536,torch.float8_e4m3fnuz,ck,120,0,16.253,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,128,8192,512,torch.float8_e4m3fnuz,ck,76,0,11.8488,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,90.62,536.51,0.0 -80,128,8192,1024,torch.float8_e4m3fnuz,ck,121,0,14.9703,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1434.5,709.19,0.0 -80,128,8192,1536,torch.float8_e4m3fnuz,ck,121,0,19.1559,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,160,8192,512,torch.float8_e4m3fnuz,ck,84,0,12.6363,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,106.22,545.86,0.0 -80,160,8192,1536,torch.float8_e4m3fnuz,ck,100,0,21.719,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,192,8192,512,torch.float8_e4m3fnuz,ck,84,0,14.7407,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,109.26,504.61,0.0 -80,192,8192,1024,torch.float8_e4m3fnuz,ck,85,0,18.7963,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1713.76,624.11,0.0 -80,192,8192,1536,torch.float8_e4m3fnuz,ck,86,0,24.7435,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,224,8192,512,torch.float8_e4m3fnuz,ck,76,0,16.6705,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,112.72,478.63,0.0 -80,224,8192,1536,torch.float8_e4m3fnuz,ck,100,0,28.8459,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,256,8192,512,torch.float8_e4m3fnuz,ck,76,0,18.1586,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,118.26,469.18,0.0 -80,256,8192,1024,torch.float8_e4m3fnuz,ck,70,0,23.9267,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1795.05,536.85,0.0 -80,256,8192,1536,torch.float8_e4m3fnuz,ck,85,0,30.3059,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,288,8192,512,torch.float8_e4m3fnuz,ck,85,0,19.4782,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,124.03,465.15,0.0 -80,288,8192,1536,torch.float8_e4m3fnuz,ck,85,0,32.0963,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,8192,512,torch.float8_e4m3fnuz,ck,85,0,19.6127,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,136.87,489.53,0.0 -80,320,8192,1024,torch.float8_e4m3fnuz,ck,85,0,24.7583,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2168.45,563.82,0.0 -80,320,8192,1536,torch.float8_e4m3fnuz,ck,101,0,32.6299,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,352,8192,512,torch.float8_e4m3fnuz,ck,84,0,22.5512,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,130.94,449.72,0.0 -80,352,8192,1536,torch.float8_e4m3fnuz,ck,85,0,38.7583,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,8192,512,torch.float8_e4m3fnuz,ck,84,0,23.7469,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,135.65,449.84,0.0 -80,384,8192,1536,torch.float8_e4m3fnuz,ck,85,0,38.4623,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,8192,512,torch.float8_e4m3fnuz,ck,84,0,25.8591,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,134.95,434.01,0.0 -80,416,8192,1536,torch.float8_e4m3fnuz,ck,72,0,43.8675,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,448,8192,512,torch.float8_e4m3fnuz,ck,72,0,26.0849,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,144.07,450.98,0.0 -80,448,8192,1536,torch.float8_e4m3fnuz,ck,72,0,43.8247,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,480,8192,512,torch.float8_e4m3fnuz,ck,102,0,28.9604,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,139.04,424.87,0.0 -80,480,8192,1536,torch.float8_e4m3fnuz,ck,102,0,43.0079,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,8192,512,torch.float8_e4m3fnuz,ck,85,0,29.9571,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,143.37,428.78,0.0 -80,512,8192,1024,torch.float8_e4m3fnuz,ck,85,0,37.882,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2267.55,456.72,0.0 -80,512,8192,1536,torch.float8_e4m3fnuz,ck,85,0,50.0471,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1024,8192,512,torch.float8_e4m3fnuz,ck,85,0,50.608,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,169.73,424.75,0.0 -80,1024,8192,1024,torch.float8_e4m3fnuz,ck,71,0,67.0889,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2560.76,390.74,0.0 -80,1024,8192,1536,torch.float8_e4m3fnuz,ck,71,0,90.3089,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,2048,8192,512,torch.float8_e4m3fnuz,ck,71,0,91.3476,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,188.07,424.72,0.0 -80,2048,8192,1024,torch.float8_e4m3fnuz,ck,71,0,120.4715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2852.11,365.57,0.0 -80,2048,8192,1536,torch.float8_e4m3fnuz,ck,71,0,160.9323,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,8192,512,torch.float8_e4m3fnuz,ck,71,0,171.3964,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,200.47,428.25,0.0 -80,4096,8192,1024,torch.float8_e4m3fnuz,ck,71,0,227.2969,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3023.34,350.61,0.0 -80,4096,8192,1536,torch.float8_e4m3fnuz,ck,71,0,307.3715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,8192,512,torch.float8_e4m3fnuz,ck,71,0,246.4712,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,209.11,438.2,0.0 -80,6144,8192,1536,torch.float8_e4m3fnuz,ck,71,0,455.0275,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,8192,512,torch.float8_e4m3fnuz,ck,71,0,326.6613,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,210.37,436.56,0.0 -80,8192,8192,1024,torch.float8_e4m3fnuz,ck,71,0,434.801,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3160.96,347.27,0.0 -80,8192,8192,1536,torch.float8_e4m3fnuz,ck,72,0,621.3308,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,10240,8192,512,torch.float8_e4m3fnuz,ck,71,0,399.9462,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.78,443.08,0.0 -80,10240,8192,1536,torch.float8_e4m3fnuz,ck,71,0,754.6592,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,8192,512,torch.float8_e4m3fnuz,ck,71,0,479.0689,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.17,442.13,0.0 -80,12288,8192,1536,torch.float8_e4m3fnuz,ck,102,0,912.1361,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,8192,512,torch.float8_e4m3fnuz,ck,71,0,553.8975,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,217.11,444.88,0.0 -80,14336,8192,1536,torch.float8_e4m3fnuz,ck,71,0,1039.5469,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,8192,512,torch.float8_e4m3fnuz,ck,71,0,634.9591,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,216.45,442.58,0.0 -80,16384,8192,1024,torch.float8_e4m3fnuz,ck,71,0,849.5169,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3235.7,345.61,0.0 -80,16384,8192,1536,torch.float8_e4m3fnuz,ck,71,0,1182.4869,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,32768,8192,512,torch.float8_e4m3fnuz,ck,71,0,1248.4259,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,220.18,446.84,0.0 -80,32768,8192,1536,torch.float8_e4m3fnuz,ck,71,0,2344.2068,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,8192,512,torch.float8_e4m3fnuz,ck,71,0,2486.4287,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,221.1,447.02,0.0 -80,65536,8192,1536,torch.float8_e4m3fnuz,ck,71,0,4684.7126,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,8192,512,torch.float8_e4m3fnuz,ck,71,0,3733.0179,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,220.9,446.06,0.0 -80,98304,8192,1536,torch.float8_e4m3fnuz,ck,71,0,7034.6763,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,8192,1536,torch.float8_e4m3fnuz,ck,71,0,9394.7156,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,9216,4096,torch.float8_e4m3fnuz,ck,11,0,13.8467,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,5.45,2727.82,0.0 -80,1,9216,7168,torch.float8_e4m3fnuz,ck,5,0,20.4394,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,6.46,3233.26,0.0 -80,2,9216,4096,torch.float8_e4m3fnuz,ck,11,0,13.2579,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,11.39,2850.66,0.0 -80,2,9216,7168,torch.float8_e4m3fnuz,ck,5,0,20.6094,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,12.82,3207.83,0.0 -80,4,9216,4096,torch.float8_e4m3fnuz,ck,5,0,13.3531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,22.62,2833.71,0.0 -80,4,9216,7168,torch.float8_e4m3fnuz,ck,6,0,20.7042,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,25.53,3195.62,0.0 -80,8,9216,4096,torch.float8_e4m3fnuz,ck,11,0,14.0931,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,42.86,2691.31,0.0 -80,8,9216,7168,torch.float8_e4m3fnuz,ck,6,0,20.7785,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,50.87,3189.12,0.0 -80,16,9216,4096,torch.float8_e4m3fnuz,ck,5,0,14.7532,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,81.88,2583.11,0.0 -80,16,9216,7168,torch.float8_e4m3fnuz,ck,6,0,21.4925,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,98.36,3092.7,0.0 -80,32,9216,4096,torch.float8_e4m3fnuz,ck,119,0,18.4783,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1307.44,2081.88,0.0 -80,32,9216,7168,torch.float8_e4m3fnuz,ck,119,0,26.6078,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,158.9,2513.53,0.0 -80,64,9216,4096,torch.float8_e4m3fnuz,ck,114,0,25.0703,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1927.32,1563.23,0.0 -80,64,9216,7168,torch.float8_e4m3fnuz,ck,121,0,39.8294,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,212.3,1699.72,0.0 -80,96,9216,7168,torch.float8_e4m3fnuz,ck,123,0,50.9032,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,249.17,1346.04,0.0 -80,128,9216,4096,torch.float8_e4m3fnuz,ck,121,0,39.1328,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2469.46,1038.32,0.0 -80,128,9216,7168,torch.float8_e4m3fnuz,ck,121,0,64.3726,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,262.71,1077.12,0.0 -80,160,9216,7168,torch.float8_e4m3fnuz,ck,136,0,75.0573,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,281.64,934.7,0.0 -80,192,9216,7168,torch.float8_e4m3fnuz,ck,86,0,85.7848,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,295.71,827.37,0.0 -80,224,9216,7168,torch.float8_e4m3fnuz,ck,138,0,101.9633,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,290.25,704.12,0.0 -80,256,9216,4096,torch.float8_e4m3fnuz,ck,70,0,64.6741,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2988.42,672.85,0.0 -80,256,9216,7168,torch.float8_e4m3fnuz,ck,0,0,109.2884,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,309.48,664.42,0.0 -80,288,9216,7168,torch.float8_e4m3fnuz,ck,94,0,120.3302,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.22,610.26,0.0 -80,320,9216,7168,torch.float8_e4m3fnuz,ck,93,0,125.6795,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,336.4,590.81,0.0 -80,352,9216,7168,torch.float8_e4m3fnuz,ck,102,0,159.5041,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,291.57,470.66,0.0 -80,384,9216,7168,torch.float8_e4m3fnuz,ck,102,0,155.4479,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.37,488.21,0.0 -80,416,9216,7168,torch.float8_e4m3fnuz,ck,85,0,192.6714,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.26,398.14,0.0 -80,448,9216,7168,torch.float8_e4m3fnuz,ck,85,0,189.2582,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,312.75,409.65,0.0 -80,480,9216,7168,torch.float8_e4m3fnuz,ck,94,0,183.3908,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,345.81,427.22,0.0 -80,512,9216,7168,torch.float8_e4m3fnuz,ck,93,0,197.136,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,343.14,401.59,0.0 -80,1024,9216,4096,torch.float8_e4m3fnuz,ck,93,0,219.0276,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3529.67,277.67,0.0 -80,1024,9216,7168,torch.float8_e4m3fnuz,ck,93,0,383.3589,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,352.91,240.7,0.0 -80,2048,9216,4096,torch.float8_e4m3fnuz,ck,68,0,425.2314,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3636.11,197.27,0.0 -80,2048,9216,7168,torch.float8_e4m3fnuz,ck,85,0,734.0056,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,368.64,161.43,0.0 -80,4096,9216,4096,torch.float8_e4m3fnuz,ck,71,0,812.5832,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3805.61,160.01,0.0 -80,4096,9216,7168,torch.float8_e4m3fnuz,ck,71,0,1429.0113,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.7,119.61,0.0 -80,4240,9216,4096,torch.float8_e4m3fnuz,ck,93,0,903.6056,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3542.58,147.48,0.0 -80,6144,9216,7168,torch.float8_e4m3fnuz,ck,102,0,2153.3558,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,376.97,103.72,0.0 -80,8192,9216,7168,torch.float8_e4m3fnuz,ck,71,0,2870.3304,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.08,96.08,0.0 -80,10240,9216,7168,torch.float8_e4m3fnuz,ck,93,0,3582.8927,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.6,91.6,0.0 -80,12288,9216,7168,torch.float8_e4m3fnuz,ck,102,0,4297.9231,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.74,88.56,0.0 -80,14336,9216,7168,torch.float8_e4m3fnuz,ck,93,0,5014.6828,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.71,86.36,0.0 -80,16384,9216,4096,torch.float8_e4m3fnuz,ck,71,0,3218.0651,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3843.77,126.43,0.0 -80,16384,9216,7168,torch.float8_e4m3fnuz,ck,93,0,5708.7709,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.18,85.04,0.0 -80,32768,9216,4096,torch.float8_e4m3fnuz,ck,68,0,6437.3643,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3843.03,120.54,0.0 -80,32768,9216,7168,torch.float8_e4m3fnuz,ck,85,0,11449.253,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.13,79.04,0.0 -80,65536,9216,7168,torch.float8_e4m3fnuz,ck,85,0,22829.8274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.27,76.38,0.0 -80,98304,9216,7168,torch.float8_e4m3fnuz,ck,102,0,34028.5473,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.68,75.9,0.0 -80,1,11264,1536,torch.float8_e4m3fnuz,ck,108,0,9.3388,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,37.05,1855.22,0.0 -80,2,11264,1536,torch.float8_e4m3fnuz,ck,15,0,9.4898,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,11264,1536,torch.float8_e4m3fnuz,ck,10,0,9.667,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,11264,1536,torch.float8_e4m3fnuz,ck,10,0,10.0366,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,11264,1536,torch.float8_e4m3fnuz,ck,10,0,9.7876,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,565.66,1807.03,0.0 -80,32,11264,1536,torch.float8_e4m3fnuz,ck,76,0,12.5451,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,882.65,1440.53,0.0 -80,48,11264,1536,torch.float8_e4m3fnuz,ck,113,0,15.2534,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1088.9,1210.0,0.0 -80,64,11264,1536,torch.float8_e4m3fnuz,ck,112,0,17.4734,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1267.41,1078.3,0.0 -80,80,11264,1536,torch.float8_e4m3fnuz,ck,100,0,21.6601,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1278.04,887.65,0.0 -80,96,11264,1536,torch.float8_e4m3fnuz,ck,119,0,21.7177,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1529.58,903.03,0.0 -80,112,11264,1536,torch.float8_e4m3fnuz,ck,85,0,25.6868,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1508.77,778.48,0.0 -80,128,11264,1536,torch.float8_e4m3fnuz,ck,85,0,26.4316,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1675.72,771.11,0.0 -80,160,11264,1536,torch.float8_e4m3fnuz,ck,100,0,30.1239,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,192,11264,1536,torch.float8_e4m3fnuz,ck,85,0,31.3634,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2118.32,698.96,0.0 -80,224,11264,1536,torch.float8_e4m3fnuz,ck,100,0,37.6656,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,256,11264,1536,torch.float8_e4m3fnuz,ck,85,0,41.1,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2155.32,570.85,0.0 -80,288,11264,1536,torch.float8_e4m3fnuz,ck,102,0,42.7952,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,11264,1536,torch.float8_e4m3fnuz,ck,85,0,45.0808,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,352,11264,1536,torch.float8_e4m3fnuz,ck,85,0,51.422,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,11264,1536,torch.float8_e4m3fnuz,ck,85,0,51.3588,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,11264,1536,torch.float8_e4m3fnuz,ck,85,0,58.2264,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,11264,1536,torch.float8_e4m3fnuz,ck,85,0,57.7888,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,11264,1536,torch.float8_e4m3fnuz,ck,102,0,64.1117,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,11264,1536,torch.float8_e4m3fnuz,ck,85,0,63.9678,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2769.63,463.08,0.0 -80,1024,11264,1536,torch.float8_e4m3fnuz,ck,85,0,116.8538,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3032.29,358.94,0.0 -80,1536,11264,1536,torch.float8_e4m3fnuz,ck,85,0,168.9501,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3145.91,321.18,0.0 -80,2048,11264,1536,torch.float8_e4m3fnuz,ck,71,0,217.6677,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3255.74,305.9,0.0 -80,4096,11264,1536,torch.float8_e4m3fnuz,ck,71,0,417.0421,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3398.55,277.83,0.0 -80,6144,11264,1536,torch.float8_e4m3fnuz,ck,71,0,614.8496,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,11264,1536,torch.float8_e4m3fnuz,ck,71,0,818.2792,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3464.19,262.05,0.0 -80,10240,11264,1536,torch.float8_e4m3fnuz,ck,71,0,1039.4493,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,11264,1536,torch.float8_e4m3fnuz,ck,71,0,1221.6133,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,11264,1536,torch.float8_e4m3fnuz,ck,71,0,1422.2274,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,11264,1536,torch.float8_e4m3fnuz,ck,72,0,1650.9527,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3433.99,249.29,0.0 -80,32768,11264,1536,torch.float8_e4m3fnuz,ck,72,0,3296.9348,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3439.17,244.42,0.0 -80,49152,11264,1536,torch.float8_e4m3fnuz,ck,71,0,4796.016,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3546.29,250.23,0.0 -80,65536,11264,1536,torch.float8_e4m3fnuz,ck,71,0,6394.7949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3546.23,249.32,0.0 -80,73728,11264,1536,torch.float8_e4m3fnuz,ck,71,0,7197.2388,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3544.71,248.91,0.0 -80,98304,11264,1536,torch.float8_e4m3fnuz,ck,71,0,9673.3902,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,11264,1536,torch.float8_e4m3fnuz,ck,71,0,12851.8716,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3529.05,246.77,0.0 -80,1,12288,1536,torch.float8_e4m3fnuz,ck,15,0,8.8721,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,4.25,2130.33,0.0 -80,2,12288,1536,torch.float8_e4m3fnuz,ck,15,0,9.1519,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,8.25,2068.05,0.0 -80,4,12288,1536,torch.float8_e4m3fnuz,ck,108,0,9.0818,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,16.63,2089.76,0.0 -80,8,12288,1536,torch.float8_e4m3fnuz,ck,108,0,9.4892,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,31.82,2011.05,0.0 -80,16,12288,1536,torch.float8_e4m3fnuz,ck,10,0,10.1361,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,59.59,1903.31,0.0 -80,32,12288,1536,torch.float8_e4m3fnuz,ck,112,0,12.5215,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,96.47,1574.09,0.0 -80,64,12288,1536,torch.float8_e4m3fnuz,ck,112,0,17.7365,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,136.21,1158.38,0.0 -80,96,12288,1536,torch.float8_e4m3fnuz,ck,133,0,22.9216,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,158.1,932.79,0.0 -80,128,12288,1536,torch.float8_e4m3fnuz,ck,119,0,27.07,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,178.49,820.71,0.0 -80,160,12288,1536,torch.float8_e4m3fnuz,ck,119,0,31.5739,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,191.29,730.11,0.0 -80,192,12288,1536,torch.float8_e4m3fnuz,ck,85,0,34.1912,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,211.98,698.66,0.0 -80,224,12288,1536,torch.float8_e4m3fnuz,ck,85,0,42.2156,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,200.3,585.65,0.0 -80,256,12288,1536,torch.float8_e4m3fnuz,ck,85,0,43.4262,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.53,588.56,0.0 -80,288,12288,1536,torch.float8_e4m3fnuz,ck,102,0,48.4492,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,224.39,544.79,0.0 -80,320,12288,1536,torch.float8_e4m3fnuz,ck,93,0,51.1407,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,236.2,532.46,0.0 -80,352,12288,1536,torch.float8_e4m3fnuz,ck,93,0,60.7878,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.59,461.7,0.0 -80,384,12288,1536,torch.float8_e4m3fnuz,ck,71,0,59.7854,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,242.46,483.42,0.0 -80,416,12288,1536,torch.float8_e4m3fnuz,ck,85,0,69.8237,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,224.9,425.89,0.0 -80,448,12288,1536,torch.float8_e4m3fnuz,ck,85,0,69.2979,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.04,441.18,0.0 -80,480,12288,1536,torch.float8_e4m3fnuz,ck,94,0,71.9873,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,251.7,436.3,0.0 -80,512,12288,1536,torch.float8_e4m3fnuz,ck,85,0,75.8898,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,254.68,424.88,0.0 -80,1024,12288,1536,torch.float8_e4m3fnuz,ck,71,0,138.0326,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,280.04,330.45,0.0 -80,2048,12288,1536,torch.float8_e4m3fnuz,ck,71,0,263.7876,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,293.07,274.28,0.0 -80,4096,12288,1536,torch.float8_e4m3fnuz,ck,71,0,499.9388,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,309.28,251.69,0.0 -80,6144,12288,1536,torch.float8_e4m3fnuz,ck,71,0,739.7506,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,313.52,242.39,0.0 -80,8192,12288,1536,torch.float8_e4m3fnuz,ck,71,0,970.833,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.53,239.78,0.0 -80,10240,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1218.8252,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,317.15,234.87,0.0 -80,12288,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1456.9262,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.38,233.19,0.0 -80,14336,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1687.663,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.66,232.99,0.0 -80,16384,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1931.9974,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.12,231.21,0.0 -80,32768,12288,1536,torch.float8_e4m3fnuz,ck,71,0,3848.4946,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.41,227.23,0.0 -80,65536,12288,1536,torch.float8_e4m3fnuz,ck,71,0,7676.2381,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.28,225.39,0.0 -80,98304,12288,1536,torch.float8_e4m3fnuz,ck,71,0,11500.3768,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.67,224.84,0.0 -80,131072,12288,1536,torch.float8_e4m3fnuz,ck,71,0,15314.1682,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.09,224.72,0.0 -80,1,14336,1536,torch.float8_e4m3fnuz,ck,108,0,9.3708,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,4.7,2353.09,0.0 -80,2,14336,1536,torch.float8_e4m3fnuz,ck,108,0,9.4408,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,9.33,2338.84,0.0 -80,4,14336,1536,torch.float8_e4m3fnuz,ck,15,0,9.4788,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,18.58,2335.84,0.0 -80,8,14336,1536,torch.float8_e4m3fnuz,ck,108,0,9.4769,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,37.18,2349.06,0.0 -80,16,14336,1536,torch.float8_e4m3fnuz,ck,15,0,10.5152,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,67.01,2140.09,0.0 -80,32,14336,1536,torch.float8_e4m3fnuz,ck,112,0,12.7846,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,110.23,1798.0,0.0 -80,64,14336,1536,torch.float8_e4m3fnuz,ck,119,0,18.1404,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,155.38,1320.45,0.0 -80,96,14336,1536,torch.float8_e4m3fnuz,ck,120,0,25.9487,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,162.93,960.36,0.0 -80,128,14336,1536,torch.float8_e4m3fnuz,ck,85,0,27.8617,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,202.33,929.11,0.0 -80,160,14336,1536,torch.float8_e4m3fnuz,ck,119,0,36.4462,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,193.34,736.79,0.0 -80,192,14336,1536,torch.float8_e4m3fnuz,ck,86,0,41.4702,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,203.9,670.84,0.0 -80,224,14336,1536,torch.float8_e4m3fnuz,ck,72,0,47.9425,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,205.77,600.44,0.0 -80,256,14336,1536,torch.float8_e4m3fnuz,ck,72,0,48.7805,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,231.12,609.94,0.0 -80,288,14336,1536,torch.float8_e4m3fnuz,ck,85,0,56.6988,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,223.7,541.81,0.0 -80,320,14336,1536,torch.float8_e4m3fnuz,ck,85,0,57.1533,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.58,554.42,0.0 -80,352,14336,1536,torch.float8_e4m3fnuz,ck,85,0,69.3423,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,223.56,470.9,0.0 -80,384,14336,1536,torch.float8_e4m3fnuz,ck,85,0,69.3265,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,243.94,484.95,0.0 -80,416,14336,1536,torch.float8_e4m3fnuz,ck,85,0,77.2118,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,237.28,447.94,0.0 -80,448,14336,1536,torch.float8_e4m3fnuz,ck,85,0,76.7227,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,257.16,463.4,0.0 -80,480,14336,1536,torch.float8_e4m3fnuz,ck,86,0,86.6195,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.05,421.61,0.0 -80,512,14336,1536,torch.float8_e4m3fnuz,ck,71,0,87.7988,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,256.82,426.96,0.0 -80,1024,14336,1536,torch.float8_e4m3fnuz,ck,85,0,161.5909,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.08,327.7,0.0 -80,2048,14336,1536,torch.float8_e4m3fnuz,ck,71,0,300.1671,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,300.48,279.46,0.0 -80,4096,14336,1536,torch.float8_e4m3fnuz,ck,71,0,574.396,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.05,253.75,0.0 -80,6144,14336,1536,torch.float8_e4m3fnuz,ck,71,0,860.7529,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.36,241.21,0.0 -80,8192,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1133.1069,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.4,237.83,0.0 -80,10240,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1419.1055,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,317.79,233.49,0.0 -80,12288,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1690.3277,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.15,232.63,0.0 -80,14336,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1974.9354,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,319.69,230.43,0.0 -80,16384,14336,1536,torch.float8_e4m3fnuz,ck,71,0,2260.0129,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,319.27,228.74,0.0 -80,32768,14336,1536,torch.float8_e4m3fnuz,ck,71,0,4486.7553,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.64,225.53,0.0 -80,65536,14336,1536,torch.float8_e4m3fnuz,ck,71,0,8920.3186,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.56,224.4,0.0 -80,98304,14336,1536,torch.float8_e4m3fnuz,ck,71,0,13408.709,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.87,223.11,0.0 -80,1,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.8396,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.45,1231.34,0.0 -80,2,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.6785,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.02,1266.03,0.0 -80,4,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.9405,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.67,1227.83,0.0 -80,8,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.7881,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.77,1275.0,0.0 -80,16,16384,512,torch.float8_e4m3fnuz,ck,9,0,7.1281,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,37.66,1251.54,0.0 -80,32,16384,512,torch.float8_e4m3fnuz,ck,76,0,8.5341,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,62.91,1107.74,0.0 -80,64,16384,512,torch.float8_e4m3fnuz,ck,76,0,11.9905,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,89.55,877.24,0.0 -80,96,16384,512,torch.float8_e4m3fnuz,ck,84,0,15.4745,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,104.08,748.55,0.0 -80,128,16384,512,torch.float8_e4m3fnuz,ck,76,0,18.6149,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,115.36,679.48,0.0 -80,160,16384,512,torch.float8_e4m3fnuz,ck,84,0,21.486,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,124.94,638.25,0.0 -80,192,16384,512,torch.float8_e4m3fnuz,ck,84,0,24.2394,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,132.89,609.68,0.0 -80,224,16384,512,torch.float8_e4m3fnuz,ck,84,0,28.1282,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,133.61,563.25,0.0 -80,256,16384,512,torch.float8_e4m3fnuz,ck,85,0,30.4715,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,140.95,554.89,0.0 -80,288,16384,512,torch.float8_e4m3fnuz,ck,84,0,33.0464,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,146.21,543.88,0.0 -80,320,16384,512,torch.float8_e4m3fnuz,ck,85,0,34.1726,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,157.11,557.12,0.0 -80,352,16384,512,torch.float8_e4m3fnuz,ck,84,0,39.9477,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,147.83,503.24,0.0 -80,384,16384,512,torch.float8_e4m3fnuz,ck,85,0,40.3333,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,159.73,524.83,0.0 -80,416,16384,512,torch.float8_e4m3fnuz,ck,84,0,45.0184,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,155.03,493.87,0.0 -80,448,16384,512,torch.float8_e4m3fnuz,ck,85,0,47.0283,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,159.82,495.4,0.0 -80,480,16384,512,torch.float8_e4m3fnuz,ck,84,0,51.972,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,154.95,468.77,0.0 -80,512,16384,512,torch.float8_e4m3fnuz,ck,85,0,51.7738,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,165.91,491.14,0.0 -80,1024,16384,512,torch.float8_e4m3fnuz,ck,71,0,93.0854,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,184.56,456.22,0.0 -80,2048,16384,512,torch.float8_e4m3fnuz,ck,71,0,172.6344,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,199.03,443.4,0.0 -80,4096,16384,512,torch.float8_e4m3fnuz,ck,71,0,331.3938,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,207.37,436.65,0.0 -80,6144,16384,512,torch.float8_e4m3fnuz,ck,71,0,477.6551,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.8,445.64,0.0 -80,8192,16384,512,torch.float8_e4m3fnuz,ck,71,0,637.0735,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.73,441.11,0.0 -80,10240,16384,512,torch.float8_e4m3fnuz,ck,71,0,784.9988,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.85,444.81,0.0 -80,12288,16384,512,torch.float8_e4m3fnuz,ck,71,0,942.3456,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.77,442.87,0.0 -80,14336,16384,512,torch.float8_e4m3fnuz,ck,71,0,1094.9387,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.66,443.4,0.0 -80,16384,16384,512,torch.float8_e4m3fnuz,ck,71,0,1252.4903,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.47,442.04,0.0 -80,32768,16384,512,torch.float8_e4m3fnuz,ck,71,0,2494.7808,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,220.36,440.48,0.0 -80,65536,16384,512,torch.float8_e4m3fnuz,ck,71,0,4931.7756,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.94,443.94,0.0 -80,98304,16384,512,torch.float8_e4m3fnuz,ck,71,0,7420.0505,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.27,442.04,0.0 -80,131072,16384,512,torch.float8_e4m3fnuz,ck,9,0,18530.1837,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,118.67,235.86,0.0 -80,1,18432,7168,torch.float8_e4m3fnuz,ck,118,0,37.4132,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,7.06,3532.57,0.0 -80,2,18432,7168,torch.float8_e4m3fnuz,ck,16,0,37.7738,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,13.99,3500.01,0.0 -80,4,18432,7168,torch.float8_e4m3fnuz,ck,109,0,38.1876,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,27.68,3464.39,0.0 -80,8,18432,7168,torch.float8_e4m3fnuz,ck,109,0,39.4871,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,53.53,3354.84,0.0 -80,16,18432,7168,torch.float8_e4m3fnuz,ck,7,0,40.5749,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,104.2,3273.58,0.0 -80,32,18432,7168,torch.float8_e4m3fnuz,ck,133,0,46.9286,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,180.18,2845.38,0.0 -80,64,18432,7168,torch.float8_e4m3fnuz,ck,121,0,68.3333,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,247.48,1974.71,0.0 -80,96,18432,7168,torch.float8_e4m3fnuz,ck,86,0,88.3315,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,287.18,1543.59,0.0 -80,128,18432,7168,torch.float8_e4m3fnuz,ck,0,0,110.5497,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,305.95,1246.11,0.0 -80,160,18432,7168,torch.float8_e4m3fnuz,ck,156,0,141.951,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,297.84,980.38,0.0 -80,192,18432,7168,torch.float8_e4m3fnuz,ck,102,0,155.4518,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.37,904.3,0.0 -80,224,18432,7168,torch.float8_e4m3fnuz,ck,40,0,199.1076,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,297.28,713.1,0.0 -80,256,18432,7168,torch.float8_e4m3fnuz,ck,93,0,200.0228,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,338.19,716.88,0.0 -80,288,18432,7168,torch.float8_e4m3fnuz,ck,94,0,236.0343,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.42,613.48,0.0 -80,320,18432,7168,torch.float8_e4m3fnuz,ck,93,0,243.2446,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.62,601.09,0.0 -80,352,18432,7168,torch.float8_e4m3fnuz,ck,85,0,301.0235,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.99,490.39,0.0 -80,384,18432,7168,torch.float8_e4m3fnuz,ck,94,0,294.9271,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,344.05,505.31,0.0 -80,416,18432,7168,torch.float8_e4m3fnuz,ck,85,0,357.1532,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,307.78,421.21,0.0 -80,448,18432,7168,torch.float8_e4m3fnuz,ck,93,0,351.2371,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,337.04,432.32,0.0 -80,480,18432,7168,torch.float8_e4m3fnuz,ck,94,0,352.3586,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,359.96,434.94,0.0 -80,512,18432,7168,torch.float8_e4m3fnuz,ck,68,0,388.5432,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,348.2,398.06,0.0 -80,1024,18432,7168,torch.float8_e4m3fnuz,ck,85,0,742.4427,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.45,238.68,0.0 -80,2048,18432,7168,torch.float8_e4m3fnuz,ck,71,0,1441.3783,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,375.45,154.23,0.0 -80,4096,18432,7168,torch.float8_e4m3fnuz,ck,71,0,2836.9828,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.51,110.14,0.0 -80,6144,18432,7168,torch.float8_e4m3fnuz,ck,102,0,4288.8669,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.54,93.88,0.0 -80,8192,18432,7168,torch.float8_e4m3fnuz,ck,68,0,5709.0564,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.16,86.32,0.0 -80,10240,18432,7168,torch.float8_e4m3fnuz,ck,68,0,7143.9568,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.76,81.61,0.0 -80,12288,18432,7168,torch.float8_e4m3fnuz,ck,85,0,8555.9295,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.5,78.68,0.0 -80,14336,18432,7168,torch.float8_e4m3fnuz,ck,85,0,9992.1813,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.11,76.4,0.0 -80,16384,18432,7168,torch.float8_e4m3fnuz,ck,93,0,11413.8848,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.3,74.78,0.0 -80,32768,18432,7168,torch.float8_e4m3fnuz,ck,85,0,22842.7493,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.05,68.95,0.0 -80,65536,18432,7168,torch.float8_e4m3fnuz,ck,93,0,45613.8859,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.65,66.16,0.0 -80,98304,18432,7168,torch.float8_e4m3fnuz,ck,102,0,68165.4052,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.07,65.44,0.0 -80,1,20480,1536,torch.float8_e4m3fnuz,ck,16,0,11.0968,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,5.67,2838.64,0.0 -80,2,20480,1536,torch.float8_e4m3fnuz,ck,118,0,11.2695,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.17,2798.91,0.0 -80,4,20480,1536,torch.float8_e4m3fnuz,ck,16,0,11.25,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,22.37,2811.31,0.0 -80,8,20480,1536,torch.float8_e4m3fnuz,ck,118,0,11.572,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,43.49,2747.77,0.0 -80,16,20480,1536,torch.float8_e4m3fnuz,ck,109,0,12.6053,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,79.86,2549.5,0.0 -80,32,20480,1536,torch.float8_e4m3fnuz,ck,112,0,16.0124,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,125.73,2049.48,0.0 -80,64,20480,1536,torch.float8_e4m3fnuz,ck,121,0,21.6428,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,186.04,1579.14,0.0 -80,96,20480,1536,torch.float8_e4m3fnuz,ck,86,0,28.314,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,213.31,1255.1,0.0 -80,128,20480,1536,torch.float8_e4m3fnuz,ck,70,0,35.7174,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,225.47,1033.02,0.0 -80,160,20480,1536,torch.float8_e4m3fnuz,ck,156,0,47.001,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,214.17,813.95,0.0 -80,192,20480,1536,torch.float8_e4m3fnuz,ck,102,0,49.299,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,245.03,803.6,0.0 -80,224,20480,1536,torch.float8_e4m3fnuz,ck,71,0,61.5138,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,229.1,666.13,0.0 -80,256,20480,1536,torch.float8_e4m3fnuz,ck,85,0,63.3795,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,254.12,667.98,0.0 -80,288,20480,1536,torch.float8_e4m3fnuz,ck,102,0,74.1212,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.46,589.52,0.0 -80,320,20480,1536,torch.float8_e4m3fnuz,ck,85,0,81.5229,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.96,552.68,0.0 -80,352,20480,1536,torch.float8_e4m3fnuz,ck,71,0,90.9179,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,243.58,510.53,0.0 -80,384,20480,1536,torch.float8_e4m3fnuz,ck,85,0,90.8301,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,265.98,525.99,0.0 -80,416,20480,1536,torch.float8_e4m3fnuz,ck,85,0,106.3525,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.09,462.01,0.0 -80,448,20480,1536,torch.float8_e4m3fnuz,ck,85,0,107.2447,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,262.82,470.84,0.0 -80,480,20480,1536,torch.float8_e4m3fnuz,ck,102,0,116.0509,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,260.22,446.83,0.0 -80,512,20480,1536,torch.float8_e4m3fnuz,ck,71,0,115.4031,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.13,461.12,0.0 -80,1024,20480,1536,torch.float8_e4m3fnuz,ck,71,0,217.3761,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,296.37,344.9,0.0 -80,2048,20480,1536,torch.float8_e4m3fnuz,ck,71,0,416.8111,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,309.13,284.28,0.0 -80,4096,20480,1536,torch.float8_e4m3fnuz,ck,71,0,813.4964,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.78,252.64,0.0 -80,6144,20480,1536,torch.float8_e4m3fnuz,ck,71,0,1212.4912,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.8,241.28,0.0 -80,8192,20480,1536,torch.float8_e4m3fnuz,ck,71,0,1603.169,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.49,236.77,0.0 -80,10240,20480,1536,torch.float8_e4m3fnuz,ck,71,0,2006.1017,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.14,232.6,0.0 -80,12288,20480,1536,torch.float8_e4m3fnuz,ck,71,0,2403.8234,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.61,230.32,0.0 -80,14336,20480,1536,torch.float8_e4m3fnuz,ck,71,0,2816.6224,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.22,227.46,0.0 -80,16384,20480,1536,torch.float8_e4m3fnuz,ck,71,0,3191.8526,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.94,227.99,0.0 -80,32768,20480,1536,torch.float8_e4m3fnuz,ck,71,0,6375.7739,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.35,223.34,0.0 -80,65536,20480,1536,torch.float8_e4m3fnuz,ck,71,0,12758.9582,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.16,220.74,0.0 -80,98304,20480,1536,torch.float8_e4m3fnuz,ck,71,0,19087.7671,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,324.02,220.51,0.0 -80,1,24576,1536,torch.float8_e4m3fnuz,ck,15,0,13.5093,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,24576,1536,torch.float8_e4m3fnuz,ck,108,0,13.6777,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,24576,1536,torch.float8_e4m3fnuz,ck,15,0,13.9333,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,24576,1536,torch.float8_e4m3fnuz,ck,9,0,14.5085,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,24576,1536,torch.float8_e4m3fnuz,ck,5,0,15.7929,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,32,24576,1536,torch.float8_e4m3fnuz,ck,112,0,19.3781,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1246.73,2031.71,0.0 -80,64,24576,1536,torch.float8_e4m3fnuz,ck,93,0,27.4074,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1762.97,1495.68,0.0 -80,96,24576,1536,torch.float8_e4m3fnuz,ck,94,0,34.225,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2117.68,1245.14,0.0 -80,128,24576,1536,torch.float8_e4m3fnuz,ck,85,0,42.5762,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2269.74,1039.0,0.0 -80,160,24576,1536,torch.float8_e4m3fnuz,ck,93,0,55.5159,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2175.88,826.05,0.0 -80,192,24576,1536,torch.float8_e4m3fnuz,ck,93,0,55.8139,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2597.12,850.7,0.0 -80,224,24576,1536,torch.float8_e4m3fnuz,ck,85,0,71.3416,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2370.49,688.28,0.0 -80,256,24576,1536,torch.float8_e4m3fnuz,ck,85,0,72.5312,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2664.7,699.35,0.0 -80,288,24576,1536,torch.float8_e4m3fnuz,ck,94,0,79.4368,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2737.18,658.98,0.0 -80,320,24576,1536,torch.float8_e4m3fnuz,ck,93,0,84.1633,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2870.51,641.24,0.0 -80,352,24576,1536,torch.float8_e4m3fnuz,ck,93,0,101.4118,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2620.51,548.17,0.0 -80,384,24576,1536,torch.float8_e4m3fnuz,ck,93,0,99.8625,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2903.09,572.92,0.0 -80,416,24576,1536,torch.float8_e4m3fnuz,ck,102,0,122.5387,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,24576,1536,torch.float8_e4m3fnuz,ck,93,0,117.3539,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,24576,1536,torch.float8_e4m3fnuz,ck,102,0,119.4687,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,24576,1536,torch.float8_e4m3fnuz,ck,93,0,126.7363,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1024,24576,1536,torch.float8_e4m3fnuz,ck,68,0,241.8128,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,2048,24576,1536,torch.float8_e4m3fnuz,ck,71,0,459.7917,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,24576,1536,torch.float8_e4m3fnuz,ck,72,0,920.9012,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,6144,24576,1536,torch.float8_e4m3fnuz,ck,71,0,1348.8178,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,24576,1536,torch.float8_e4m3fnuz,ck,71,0,1786.9965,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,24576,1536,torch.float8_e4m3fnuz,ck,71,0,2209.3759,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,24576,1536,torch.float8_e4m3fnuz,ck,71,0,2656.0688,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,24576,1536,torch.float8_e4m3fnuz,ck,71,0,3089.1315,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,24576,1536,torch.float8_e4m3fnuz,ck,71,0,3536.4999,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,32768,24576,1536,torch.float8_e4m3fnuz,ck,71,0,7017.8836,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,24576,1536,torch.float8_e4m3fnuz,ck,0,0,inf,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,24576,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,131072,24576,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,32768,512,torch.float8_e4m3fnuz,ck,9,0,9.7066,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,1,32768,1536,torch.float8_e4m3fnuz,ck,15,0,17.5611,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,32768,512,torch.float8_e4m3fnuz,ck,9,0,9.513,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,2,32768,1536,torch.float8_e4m3fnuz,ck,108,0,17.4059,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,4,32768,512,torch.float8_e4m3fnuz,ck,23,0,9.1702,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 -80,4,32768,1536,torch.float8_e4m3fnuz,ck,109,0,18.3707,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,8,32768,512,torch.float8_e4m3fnuz,ck,9,0,9.9258,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,8,32768,1536,torch.float8_e4m3fnuz,ck,5,0,18.2531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,32768,512,torch.float8_e4m3fnuz,ck,9,0,10.2342,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,16,32768,1536,torch.float8_e4m3fnuz,ck,5,0,19.2975,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 -80,32,32768,512,torch.float8_e4m3fnuz,ck,76,0,12.0918,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,32,32768,1536,torch.float8_e4m3fnuz,ck,119,0,24.3999,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,64,32768,512,torch.float8_e4m3fnuz,ck,85,0,18.7478,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,64,32768,1536,torch.float8_e4m3fnuz,ck,101,0,34.2699,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,96,32768,512,torch.float8_e4m3fnuz,ck,84,0,23.8023,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,96,32768,1536,torch.float8_e4m3fnuz,ck,102,0,43.5312,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,128,32768,512,torch.float8_e4m3fnuz,ck,85,0,28.3083,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,128,32768,1536,torch.float8_e4m3fnuz,ck,85,0,54.7332,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,160,32768,512,torch.float8_e4m3fnuz,ck,84,0,34.3591,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,160,32768,1536,torch.float8_e4m3fnuz,ck,100,0,73.0357,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,192,32768,512,torch.float8_e4m3fnuz,ck,85,0,37.0759,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,192,32768,1536,torch.float8_e4m3fnuz,ck,85,0,73.3389,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,224,32768,512,torch.float8_e4m3fnuz,ck,84,0,45.0439,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,224,32768,1536,torch.float8_e4m3fnuz,ck,85,0,91.0766,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,256,32768,512,torch.float8_e4m3fnuz,ck,85,0,46.3119,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,256,32768,1536,torch.float8_e4m3fnuz,ck,85,0,92.2438,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,288,32768,512,torch.float8_e4m3fnuz,ck,84,0,53.506,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,288,32768,1536,torch.float8_e4m3fnuz,ck,102,0,101.9306,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,32768,512,torch.float8_e4m3fnuz,ck,85,0,55.0543,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,32768,1536,torch.float8_e4m3fnuz,ck,85,0,114.3911,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,352,32768,512,torch.float8_e4m3fnuz,ck,72,0,64.3624,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,352,32768,1536,torch.float8_e4m3fnuz,ck,71,0,132.552,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,32768,512,torch.float8_e4m3fnuz,ck,85,0,63.438,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,32768,1536,torch.float8_e4m3fnuz,ck,71,0,132.9488,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,32768,512,torch.float8_e4m3fnuz,ck,84,0,73.9008,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,416,32768,1536,torch.float8_e4m3fnuz,ck,85,0,155.3704,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,32768,512,torch.float8_e4m3fnuz,ck,85,0,72.2692,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,32768,1536,torch.float8_e4m3fnuz,ck,85,0,153.3388,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,32768,512,torch.float8_e4m3fnuz,ck,102,0,81.2012,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,32768,1536,torch.float8_e4m3fnuz,ck,102,0,155.1241,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,32768,512,torch.float8_e4m3fnuz,ck,85,0,82.7804,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,32768,1536,torch.float8_e4m3fnuz,ck,71,0,167.8113,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1024,32768,512,torch.float8_e4m3fnuz,ck,71,0,148.0686,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1024,32768,1536,torch.float8_e4m3fnuz,ck,71,0,317.4319,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,2048,32768,512,torch.float8_e4m3fnuz,ck,71,0,283.8954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,2048,32768,1536,torch.float8_e4m3fnuz,ck,71,0,610.8983,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,32768,512,torch.float8_e4m3fnuz,ck,71,0,554.3222,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,32768,1536,torch.float8_e4m3fnuz,ck,71,0,1193.686,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,32768,512,torch.float8_e4m3fnuz,ck,71,0,793.4262,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,32768,1536,torch.float8_e4m3fnuz,ck,102,0,1794.6908,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,32768,512,torch.float8_e4m3fnuz,ck,71,0,1053.245,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,32768,1536,torch.float8_e4m3fnuz,ck,71,0,2345.8711,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,32768,512,torch.float8_e4m3fnuz,ck,72,0,1357.7711,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,10240,32768,1536,torch.float8_e4m3fnuz,ck,71,0,2923.8951,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,32768,512,torch.float8_e4m3fnuz,ck,71,0,1573.8691,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,32768,1536,torch.float8_e4m3fnuz,ck,71,0,3526.7572,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,32768,512,torch.float8_e4m3fnuz,ck,71,0,1827.3845,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,32768,1536,torch.float8_e4m3fnuz,ck,71,0,4108.8811,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,32768,512,torch.float8_e4m3fnuz,ck,71,0,2090.8366,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,32768,1536,torch.float8_e4m3fnuz,ck,71,0,4672.9569,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,32768,32768,512,torch.float8_e4m3fnuz,ck,71,0,4165.6734,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,32768,32768,1536,torch.float8_e4m3fnuz,ck,71,0,9432.9791,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,32768,512,torch.float8_e4m3fnuz,ck,74,0,1.0,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 -80,65536,32768,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,98304,32768,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1,36864,7168,torch.float8_e4m3fnuz,ck,6,0,76.2189,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,2,36864,7168,torch.float8_e4m3fnuz,ck,20,0,78.3165,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,0.0,0.0,0.0 -80,4,36864,7168,torch.float8_e4m3fnuz,ck,32,0,78.8657,a8w8_bpreshuffle_256x16x512x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v2,0.0,0.0,0.0 -80,8,36864,7168,torch.float8_e4m3fnuz,ck,111,0,79.5469,a8w8_bpreshuffle_256x16x512x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,16,36864,7168,torch.float8_e4m3fnuz,ck,6,0,80.7001,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,32,36864,7168,torch.float8_e4m3fnuz,ck,133,0,93.517,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 -80,64,36864,7168,torch.float8_e4m3fnuz,ck,121,0,129.978,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,96,36864,7168,torch.float8_e4m3fnuz,ck,102,0,155.5081,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,128,36864,7168,torch.float8_e4m3fnuz,ck,93,0,201.1007,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,160,36864,7168,torch.float8_e4m3fnuz,ck,156,0,267.3258,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 -80,192,36864,7168,torch.float8_e4m3fnuz,ck,94,0,286.0543,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,224,36864,7168,torch.float8_e4m3fnuz,ck,40,0,377.2671,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,256,36864,7168,torch.float8_e4m3fnuz,ck,68,0,371.2046,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,288,36864,7168,torch.float8_e4m3fnuz,ck,102,0,437.3066,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,320,36864,7168,torch.float8_e4m3fnuz,ck,128,0,476.2091,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,352,36864,7168,torch.float8_e4m3fnuz,ck,71,0,568.6663,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,384,36864,7168,torch.float8_e4m3fnuz,ck,94,0,555.2415,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,416,36864,7168,torch.float8_e4m3fnuz,ck,93,0,649.7863,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,448,36864,7168,torch.float8_e4m3fnuz,ck,93,0,631.7091,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,480,36864,7168,torch.float8_e4m3fnuz,ck,102,0,655.0492,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,512,36864,7168,torch.float8_e4m3fnuz,ck,68,0,726.0635,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,1024,36864,7168,torch.float8_e4m3fnuz,ck,93,0,1406.3674,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,2048,36864,7168,torch.float8_e4m3fnuz,ck,93,0,2759.912,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,4096,36864,7168,torch.float8_e4m3fnuz,ck,93,0,5492.6804,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,6144,36864,7168,torch.float8_e4m3fnuz,ck,93,0,8229.8778,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,8192,36864,7168,torch.float8_e4m3fnuz,ck,93,0,10983.7146,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,10240,36864,7168,torch.float8_e4m3fnuz,ck,93,0,13738.3404,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,12288,36864,7168,torch.float8_e4m3fnuz,ck,93,0,16426.5845,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,14336,36864,7168,torch.float8_e4m3fnuz,ck,93,0,19165.8682,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,16384,36864,7168,torch.float8_e4m3fnuz,ck,71,0,21975.6028,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,32768,36864,7168,torch.float8_e4m3fnuz,ck,93,0,43711.479,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -80,65536,36864,7168,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 -256,1,256,7168,torch.float8_e4m3fn,cktile,138,0,7.4399,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,0.49,247.68,0.0 -256,2,256,7168,torch.float8_e4m3fn,cktile,138,0,7.5203,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,0.98,246.05,0.0 -256,4,256,7168,torch.float8_e4m3fn,cktile,138,0,7.6107,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,1.93,245.15,0.0 -256,8,256,7168,torch.float8_e4m3fn,cktile,30,0,7.8028,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,3.76,243.05,0.0 -256,16,256,7168,torch.float8_e4m3fn,ck,10,0,6.9469,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.45,281.84,0.0 -256,32,256,7168,torch.float8_e4m3fn,ck,8,0,6.939,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,16.92,299.87,0.0 -256,64,256,7168,torch.float8_e4m3fn,ck,8,0,8.0014,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,29.35,290.77,0.0 -256,128,256,7168,torch.float8_e4m3fn,ck,10,0,9.8581,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,47.65,285.86,0.0 -256,256,256,7168,torch.float8_e4m3fn,ck,8,0,10.3554,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,90.73,367.06,0.0 -256,512,256,7168,torch.float8_e4m3fn,ck,10,0,10.5211,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,178.6,548.15,0.0 -256,1024,256,7168,torch.float8_e4m3fn,ck,5,0,11.7135,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,320.83,828.05,0.0 -256,2048,256,7168,torch.float8_e4m3fn,cktile,9,0,13.9827,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,537.54,1256.1,0.0 -256,4096,256,7168,torch.float8_e4m3fn,ck,114,0,20.5991,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,729.76,1616.2,0.0 -256,8192,256,7168,torch.float8_e4m3fn,cktile,24,0,30.5979,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x128_default,982.58,2116.14,0.0 -256,16384,256,7168,torch.float8_e4m3fn,ck,139,0,44.5889,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1348.53,2863.14,0.0 -256,32768,256,7168,torch.float8_e4m3fn,ck,66,0,75.1594,a8w8_bpreshuffle_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1600.05,3372.74,0.0 -256,1,2112,7168,torch.float8_e4m3fn,ck,10,0,11.5479,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.62,1311.94,0.0 -256,2,2112,7168,torch.float8_e4m3fn,cktile,30,0,11.761,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,5.15,1289.14,0.0 -256,4,2112,7168,torch.float8_e4m3fn,ck,10,0,11.4824,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.55,1322.41,0.0 -256,8,2112,7168,torch.float8_e4m3fn,ck,10,0,11.727,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,20.65,1298.71,0.0 -256,16,2112,7168,torch.float8_e4m3fn,ck,10,0,10.6299,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,45.57,1441.32,0.0 -256,32,2112,7168,torch.float8_e4m3fn,ck,8,0,10.4988,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,92.29,1476.68,0.0 -256,64,2112,7168,torch.float8_e4m3fn,ck,10,0,10.6664,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,181.67,1487.65,0.0 -256,96,2112,7168,torch.float8_e4m3fn,cktile,137,0,12.0882,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,240.45,1342.83,0.0 -256,128,2112,7168,torch.float8_e4m3fn,ck,5,0,13.7387,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,282.09,1208.05,0.0 -256,256,2112,7168,torch.float8_e4m3fn,cktile,152,0,16.4078,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,472.4,1100.4,0.0 -256,512,2112,7168,torch.float8_e4m3fn,cktile,22,0,22.4044,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x128_default,691.92,936.04,0.0 -256,1024,2112,7168,torch.float8_e4m3fn,cktile,225,0,30.7438,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_80x64x256_1x4x1_16x16x128_default,1008.47,871.86,0.0 -256,2048,2112,7168,torch.float8_e4m3fn,cktile,131,0,43.1862,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x128_default,1435.84,890.79,0.0 -256,4096,2112,7168,torch.float8_e4m3fn,cktile,131,0,68.6384,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x128_default,1806.82,900.38,0.0 -256,8192,2112,7168,torch.float8_e4m3fn,flydsl,485,0,119.6395,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x0x1x1_default,2073.18,906.57,0.0 -256,16384,2112,7168,torch.float8_e4m3fn,cktile,121,0,218.3492,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2271.91,924.14,0.0 -256,32768,2112,7168,torch.float8_e4m3fn,cktile,121,0,404.4086,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2453.3,960.49,0.0 -256,1,3072,1536,torch.float8_e4m3fn,ck,8,0,4.9938,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,1.89,946.43,0.0 -256,2,3072,1536,torch.float8_e4m3fn,ck,10,0,4.9926,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,3.78,948.19,0.0 -256,4,3072,1536,torch.float8_e4m3fn,ck,8,0,5.0475,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,7.48,940.92,0.0 -256,8,3072,1536,torch.float8_e4m3fn,ck,10,0,5.4956,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,13.74,869.79,0.0 -256,16,3072,1536,torch.float8_e4m3fn,ck,8,0,5.1326,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,29.42,943.28,0.0 -256,32,3072,1536,torch.float8_e4m3fn,flydsl,499,0,4.9778,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x1x0x1_default,60.67,997.3,0.0 -256,64,3072,1536,torch.float8_e4m3fn,flydsl,1289,0,5.0902,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x4_default,118.66,1023.56,0.0 -256,96,3072,1536,torch.float8_e4m3fn,flydsl,424,0,5.3379,flydsl_bpreshuflle_32x64x512_F8_F8_B16_1x0x1x1_default,169.72,1022.1,0.0 -256,128,3072,1536,torch.float8_e4m3fn,flydsl,1318,0,5.7223,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x4_default,211.1,996.39,0.0 -256,256,3072,1536,torch.float8_e4m3fn,ck,119,0,7.1386,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,338.43,936.41,0.0 -256,512,3072,1536,torch.float8_e4m3fn,ck,112,0,8.925,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,541.38,969.27,0.0 -256,1024,3072,1536,torch.float8_e4m3fn,flydsl,873,0,11.6927,flydsl_bpreshuflle_64x192x256_F8_F8_B16_2x1x0x2_default,826.47,1076.13,0.0 -256,2048,3072,1536,torch.float8_e4m3fn,flydsl,815,0,16.6873,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x0x1x2_default,1158.21,1225.32,0.0 -256,4096,3072,1536,torch.float8_e4m3fn,flydsl,651,0,24.8402,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x1x1x1_default,1556.14,1456.34,0.0 -256,8192,3072,1536,torch.float8_e4m3fn,flydsl,319,0,48.914,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x1x1x0_default,1580.52,1382.7,0.0 -256,16384,3072,1536,torch.float8_e4m3fn,flydsl,601,0,92.0763,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1679.25,1417.82,0.0 -256,20480,3072,1536,torch.float8_e4m3fn,ck,58,0,125.715,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1537.39,1288.67,0.0 -256,32768,3072,1536,torch.float8_e4m3fn,cktile,121,0,192.7525,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,1604.32,1330.08,0.0 -256,1,4096,512,torch.float8_e4m3fn,cktile,30,0,2.8621,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,1.47,735.77,0.0 -256,2,4096,512,torch.float8_e4m3fn,cktile,2,0,2.6128,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,3.21,809.31,0.0 -256,4,4096,512,torch.float8_e4m3fn,cktile,2,0,2.8132,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,5.96,757.84,0.0 -256,8,4096,512,torch.float8_e4m3fn,cktile,30,0,2.6844,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,12.5,807.18,0.0 -256,16,4096,512,torch.float8_e4m3fn,cktile,33,0,3.0517,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x128_default,21.99,732.84,0.0 -256,32,4096,512,torch.float8_e4m3fn,cktile,30,0,3.1527,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,42.57,753.54,0.0 -256,64,4096,512,torch.float8_e4m3fn,cktile,30,0,3.319,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,80.88,799.7,0.0 -256,128,4096,512,torch.float8_e4m3fn,cktile,30,0,3.4228,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,156.85,938.2,0.0 -256,256,4096,512,torch.float8_e4m3fn,flydsl,758,0,4.5124,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x2_default,237.95,958.55,0.0 -256,512,4096,512,torch.float8_e4m3fn,cktile,110,0,6.4827,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_48x64x256_1x4x1_16x16x128_default,331.26,1010.94,0.0 -256,1024,4096,512,torch.float8_e4m3fn,flydsl,16,0,8.0485,flydsl_bpreshuflle_64x256x128_F8_F8_B16_1x0x0x0_default,533.64,1367.96,0.0 -256,2048,4096,512,torch.float8_e4m3fn,ck,86,0,13.3525,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,643.32,1492.08,0.0 -256,4096,4096,512,torch.float8_e4m3fn,flydsl,186,0,16.832,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,1020.67,2242.68,0.0 -256,8192,4096,512,torch.float8_e4m3fn,ck,86,0,41.8389,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,821.24,1754.36,0.0 -256,16384,4096,512,torch.float8_e4m3fn,flydsl,269,0,63.474,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x0_default,1082.64,2279.73,0.0 -256,20480,4096,512,torch.float8_e4m3fn,ck,86,0,91.9844,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,933.85,1960.71,0.0 -256,32768,4096,512,torch.float8_e4m3fn,ck,86,0,143.1262,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,960.26,2007.39,0.0 -256,1,7168,2048,torch.float8_e4m3fn,cktile,30,0,5.8098,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,5.05,2529.6,0.0 -256,2,7168,2048,torch.float8_e4m3fn,cktile,137,0,5.8346,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,10.06,2521.65,0.0 -256,4,7168,2048,torch.float8_e4m3fn,cktile,138,0,5.8415,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,20.1,2524.28,0.0 -256,8,7168,2048,torch.float8_e4m3fn,cktile,138,0,5.9349,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,39.58,2495.6,0.0 -256,16,7168,2048,torch.float8_e4m3fn,ck,10,0,6.0372,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,77.81,2475.02,0.0 -256,32,7168,2048,torch.float8_e4m3fn,cktile,137,0,6.2634,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,150.0,2427.49,0.0 -256,64,7168,2048,torch.float8_e4m3fn,cktile,37,0,7.2353,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,259.71,2173.88,0.0 -256,96,7168,2048,torch.float8_e4m3fn,ck,113,0,8.5986,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,327.79,1890.18,0.0 -256,128,7168,2048,torch.float8_e4m3fn,ck,114,0,8.3615,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,449.45,2006.48,0.0 -256,256,7168,2048,torch.float8_e4m3fn,flydsl,565,0,11.2626,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x1x0x1_default,667.36,1675.84,0.0 -256,512,7168,2048,torch.float8_e4m3fn,flydsl,152,0,15.7151,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x0_default,956.56,1467.93,0.0 -256,1024,7168,2048,torch.float8_e4m3fn,ck,70,0,24.0038,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1252.5,1310.51,0.0 -256,2048,7168,2048,torch.float8_e4m3fn,flydsl,979,0,39.185,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1534.5,1230.94,0.0 -256,4096,7168,2048,torch.float8_e4m3fn,flydsl,979,0,73.2445,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1641.89,1116.66,0.0 -256,8192,7168,2048,torch.float8_e4m3fn,flydsl,979,0,130.9968,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1836.06,1136.65,0.0 -256,16384,7168,2048,torch.float8_e4m3fn,flydsl,186,0,246.697,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,1949.91,1147.62,0.0 -256,20480,7168,2048,torch.float8_e4m3fn,cktile,115,0,316.101,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,1902.23,1107.95,0.0 -256,32768,7168,2048,torch.float8_e4m3fn,cktile,115,0,494.3189,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,1946.26,1115.78,0.0 +gfx,cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx942,80,1,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.109,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.09,46.1,0.0 +gfx942,80,2,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.1816,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.18,46.49,0.0 +gfx942,80,4,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.1737,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.36,47.96,0.0 +gfx942,80,8,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.4092,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.71,49.68,0.0 +gfx942,80,16,64,7168,torch.float8_e4m3fnuz,ck,24,0,8.826,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,1.66,65.2,0.0 +gfx942,80,32,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4134,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,3.12,73.54,0.0 +gfx942,80,64,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.3656,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,6.27,98.84,0.0 +gfx942,80,96,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.3563,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,9.41,123.89,0.0 +gfx942,80,128,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4481,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,12.43,147.4,0.0 +gfx942,80,160,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4711,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,15.5,171.69,0.0 +gfx942,80,192,64,7168,torch.float8_e4m3fnuz,ck,24,0,9.4734,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,18.6,196.3,0.0 +gfx942,80,224,64,7168,torch.float8_e4m3fnuz,ck,10,0,9.6924,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,21.2,215.95,0.0 +gfx942,80,256,64,7168,torch.float8_e4m3fnuz,ck,10,0,9.9464,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,23.61,233.91,0.0 +gfx942,80,288,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.0283,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,26.35,255.28,0.0 +gfx942,80,320,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.6767,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,27.5,261.64,0.0 +gfx942,80,352,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.7355,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,30.08,281.96,0.0 +gfx942,80,384,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.7324,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,32.83,303.79,0.0 +gfx942,80,416,64,7168,torch.float8_e4m3fnuz,ck,10,0,10.8957,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,35.03,320.67,0.0 +gfx942,80,448,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.9127,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,37.67,341.56,0.0 +gfx942,80,480,64,7168,torch.float8_e4m3fnuz,ck,24,0,10.9898,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,40.07,360.41,0.0 +gfx942,80,512,64,7168,torch.float8_e4m3fnuz,ck,24,0,11.1383,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,42.18,376.57,0.0 +gfx942,80,1024,64,7168,torch.float8_e4m3fnuz,ck,5,0,12.4536,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,75.44,636.75,0.0 +gfx942,80,2048,64,7168,torch.float8_e4m3fnuz,ck,12,0,18.011,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,104.33,855.09,0.0 +gfx942,80,4096,64,7168,torch.float8_e4m3fnuz,ck,114,0,24.9271,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,150.76,1217.28,0.0 +gfx942,80,6144,64,7168,torch.float8_e4m3fnuz,ck,115,0,32.8306,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,171.7,1379.36,0.0 +gfx942,80,8192,64,7168,torch.float8_e4m3fnuz,ck,114,0,41.1456,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,182.67,1463.77,0.0 +gfx942,80,10240,64,7168,torch.float8_e4m3fnuz,ck,144,0,42.9825,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.58,1748.85,0.0 +gfx942,80,12288,64,7168,torch.float8_e4m3fnuz,ck,114,0,61.5548,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,183.16,1463.93,0.0 +gfx942,80,14336,64,7168,torch.float8_e4m3fnuz,ck,114,0,62.6178,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,210.06,1677.71,0.0 +gfx942,80,16384,64,7168,torch.float8_e4m3fnuz,ck,114,0,78.1928,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,192.25,1534.62,0.0 +gfx942,80,32768,64,7168,torch.float8_e4m3fnuz,ck,114,0,137.0183,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.42,1748.19,0.0 +gfx942,80,65536,64,7168,torch.float8_e4m3fnuz,ck,114,0,263.1602,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,228.49,1818.7,0.0 +gfx942,80,98304,64,7168,torch.float8_e4m3fnuz,ck,114,0,397.9789,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,226.63,1803.32,0.0 +gfx942,80,1,128,7168,torch.float8_e4m3fnuz,ck,25,0,10.735,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,2,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.1434,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.0606,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.4198,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,128,7168,torch.float8_e4m3fnuz,ck,19,0,10.0946,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,32,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.0298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,64,128,7168,torch.float8_e4m3fnuz,ck,24,0,9.9198,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,96,128,7168,torch.float8_e4m3fnuz,ck,25,0,10.3566,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,128,128,7168,torch.float8_e4m3fnuz,ck,10,0,9.909,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,160,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.2346,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.2034,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,224,128,7168,torch.float8_e4m3fnuz,ck,24,0,10.2858,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,256,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.8778,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,288,128,7168,torch.float8_e4m3fnuz,ck,10,0,10.7298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,320,128,7168,torch.float8_e4m3fnuz,ck,24,0,10.9094,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,352,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.2254,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,384,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.3822,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,416,128,7168,torch.float8_e4m3fnuz,ck,24,0,11.6798,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,448,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.523,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,480,128,7168,torch.float8_e4m3fnuz,ck,10,0,11.607,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,512,128,7168,torch.float8_e4m3fnuz,ck,11,0,12.1131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,1024,128,7168,torch.float8_e4m3fnuz,ck,6,0,17.8979,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2048,128,7168,torch.float8_e4m3fnuz,ck,119,0,24.0187,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4096,128,7168,torch.float8_e4m3fnuz,ck,121,0,38.9415,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,128,7168,torch.float8_e4m3fnuz,ck,123,0,49.1848,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,128,7168,torch.float8_e4m3fnuz,ck,124,0,61.8924,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,128,7168,torch.float8_e4m3fnuz,ck,121,0,63.2764,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,128,7168,torch.float8_e4m3fnuz,ck,141,0,84.5069,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,128,7168,torch.float8_e4m3fnuz,ck,86,0,91.5501,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,128,7168,torch.float8_e4m3fnuz,ck,0,0,114.6838,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,32768,128,7168,torch.float8_e4m3fnuz,ck,121,0,208.4886,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,128,7168,torch.float8_e4m3fnuz,ck,121,0,383.2232,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,128,7168,torch.float8_e4m3fnuz,ck,87,0,561.5334,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,128,7168,torch.float8_e4m3fnuz,ck,0,0,719.3472,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.212,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.36,180.44,0.0 +gfx942,80,2,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.52,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.7,175.89,0.0 +gfx942,80,4,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.571,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.39,176.49,0.0 +gfx942,80,8,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.72,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.74,176.91,0.0 +gfx942,80,16,256,7168,torch.float8_e4m3fnuz,ck,24,0,9.298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,6.32,210.57,0.0 +gfx942,80,32,256,7168,torch.float8_e4m3fnuz,ck,24,0,9.6449,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,12.18,215.74,0.0 +gfx942,80,64,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.023,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,23.43,232.12,0.0 +gfx942,80,96,256,7168,torch.float8_e4m3fnuz,ck,24,0,10.2032,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,34.53,252.11,0.0 +gfx942,80,128,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.6508,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,44.11,264.59,0.0 +gfx942,80,160,256,7168,torch.float8_e4m3fnuz,ck,10,0,10.9094,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,53.83,280.84,0.0 +gfx942,80,192,256,7168,torch.float8_e4m3fnuz,ck,11,0,11.7012,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,60.22,282.84,0.0 +gfx942,80,224,256,7168,torch.float8_e4m3fnuz,ck,10,0,11.9205,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,68.96,298.25,0.0 +gfx942,80,256,256,7168,torch.float8_e4m3fnuz,ck,5,0,11.9933,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,78.34,316.93,0.0 +gfx942,80,288,256,7168,torch.float8_e4m3fnuz,ck,25,0,11.9972,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,88.1,337.32,0.0 +gfx942,80,320,256,7168,torch.float8_e4m3fnuz,ck,11,0,12.2086,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,96.19,351.61,0.0 +gfx942,80,352,256,7168,torch.float8_e4m3fnuz,ck,10,0,16.2524,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,79.49,279.24,0.0 +gfx942,80,384,256,7168,torch.float8_e4m3fnuz,ck,24,0,16.3243,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,86.33,293.07,0.0 +gfx942,80,416,256,7168,torch.float8_e4m3fnuz,ck,10,0,16.4786,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,92.65,305.24,0.0 +gfx942,80,448,256,7168,torch.float8_e4m3fnuz,ck,24,0,16.7118,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,98.38,315.68,0.0 +gfx942,80,480,256,7168,torch.float8_e4m3fnuz,ck,19,0,17.129,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,102.84,322.34,0.0 +gfx942,80,512,256,7168,torch.float8_e4m3fnuz,ck,6,0,17.9831,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,104.49,320.7,0.0 +gfx942,80,1024,256,7168,torch.float8_e4m3fnuz,ck,119,0,23.9594,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,156.85,404.82,0.0 +gfx942,80,2048,256,7168,torch.float8_e4m3fnuz,ck,114,0,37.6676,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,199.54,466.28,0.0 +gfx942,80,4096,256,7168,torch.float8_e4m3fnuz,ck,121,0,63.1504,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,238.04,527.19,0.0 +gfx942,80,6144,256,7168,torch.float8_e4m3fnuz,ck,136,0,77.3912,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,291.36,633.42,0.0 +gfx942,80,8192,256,7168,torch.float8_e4m3fnuz,ck,138,0,106.0081,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,283.61,610.8,0.0 +gfx942,80,10240,256,7168,torch.float8_e4m3fnuz,ck,0,0,113.4228,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,331.34,709.54,0.0 +gfx942,80,12288,256,7168,torch.float8_e4m3fnuz,ck,136,0,152.8106,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,295.12,629.58,0.0 +gfx942,80,14336,256,7168,torch.float8_e4m3fnuz,ck,102,0,161.1134,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.56,694.76,0.0 +gfx942,80,16384,256,7168,torch.float8_e4m3fnuz,ck,85,0,207.4081,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,289.91,615.52,0.0 +gfx942,80,32768,256,7168,torch.float8_e4m3fnuz,ck,0,0,379.1653,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,317.17,668.56,0.0 +gfx942,80,65536,256,7168,torch.float8_e4m3fnuz,ck,102,0,699.9897,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,343.6,721.66,0.0 +gfx942,80,98304,256,7168,torch.float8_e4m3fnuz,ck,102,0,973.4779,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,370.61,777.43,0.0 +gfx942,80,131072,256,7168,torch.float8_e4m3fnuz,ck,102,0,1385.804,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.12,727.71,0.0 +gfx942,80,1,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.7317,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.63,313.53,0.0 +gfx942,80,2,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.2907,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.6471,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.7575,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,512,7168,torch.float8_e4m3fnuz,ck,24,0,10.7923,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,108.82,352.2,0.0 +gfx942,80,32,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.1022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,211.56,354.18,0.0 +gfx942,80,48,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.2279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,313.79,361.89,0.0 +gfx942,80,64,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.1263,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,422.21,376.97,0.0 +gfx942,80,80,512,7168,torch.float8_e4m3fnuz,ck,24,0,11.2303,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,522.87,385.15,0.0 +gfx942,80,96,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.6635,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,604.14,382.08,0.0 +gfx942,80,112,512,7168,torch.float8_e4m3fnuz,ck,10,0,11.9903,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,685.62,382.6,0.0 +gfx942,80,128,512,7168,torch.float8_e4m3fnuz,ck,19,0,12.3367,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,761.57,382.48,0.0 +gfx942,80,160,512,7168,torch.float8_e4m3fnuz,ck,25,0,12.1467,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,192,512,7168,torch.float8_e4m3fnuz,ck,10,0,16.0911,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,224,512,7168,torch.float8_e4m3fnuz,ck,19,0,17.1051,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,256,512,7168,torch.float8_e4m3fnuz,ck,6,0,17.5859,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1068.5,327.94,0.0 +gfx942,80,288,512,7168,torch.float8_e4m3fnuz,ck,6,0,17.6475,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,320,512,7168,torch.float8_e4m3fnuz,ck,6,0,17.8315,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,352,512,7168,torch.float8_e4m3fnuz,ck,112,0,22.2763,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,384,512,7168,torch.float8_e4m3fnuz,ck,113,0,21.2655,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,416,512,7168,torch.float8_e4m3fnuz,ck,112,0,22.5591,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,448,512,7168,torch.float8_e4m3fnuz,ck,112,0,22.7188,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,480,512,7168,torch.float8_e4m3fnuz,ck,113,0,21.5703,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,512,512,7168,torch.float8_e4m3fnuz,ck,119,0,23.6639,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1588.11,332.33,0.0 +gfx942,80,1024,512,7168,torch.float8_e4m3fnuz,ck,114,0,36.6616,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2050.15,328.92,0.0 +gfx942,80,1536,512,7168,torch.float8_e4m3fnuz,ck,120,0,48.0728,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2345.25,338.09,0.0 +gfx942,80,2048,512,7168,torch.float8_e4m3fnuz,ck,85,0,58.1113,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2586.83,351.86,0.0 +gfx942,80,4096,512,7168,torch.float8_e4m3fnuz,ck,85,0,101.7067,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2956.03,366.0,0.0 +gfx942,80,6144,512,7168,torch.float8_e4m3fnuz,ck,85,0,143.4871,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,512,7168,torch.float8_e4m3fnuz,ck,85,0,195.7643,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3071.53,361.55,0.0 +gfx942,80,10240,512,7168,torch.float8_e4m3fnuz,ck,0,0,211.3264,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,512,7168,torch.float8_e4m3fnuz,ck,0,0,262.6171,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,512,7168,torch.float8_e4m3fnuz,ck,102,0,306.9194,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,512,7168,torch.float8_e4m3fnuz,ck,85,0,354.1062,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3396.13,389.4,0.0 +gfx942,80,20480,512,7168,torch.float8_e4m3fnuz,ck,70,0,411.9813,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3648.8,416.14,0.0 +gfx942,80,32768,512,7168,torch.float8_e4m3fnuz,ck,70,0,664.7692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3618.07,409.32,0.0 +gfx942,80,65536,512,7168,torch.float8_e4m3fnuz,ck,70,0,1327.0618,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,512,7168,torch.float8_e4m3fnuz,ck,102,0,1866.4062,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,512,7168,torch.float8_e4m3fnuz,ck,102,0,2615.157,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.2398,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.5102,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.6411,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.871,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,576,7168,torch.float8_e4m3fnuz,ck,10,0,10.7303,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,12.31,397.18,0.0 +gfx942,80,32,576,7168,torch.float8_e4m3fnuz,ck,10,0,11.199,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,235.95,392.45,0.0 +gfx942,80,64,576,7168,torch.float8_e4m3fnuz,ck,24,0,11.163,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,473.42,417.56,0.0 +gfx942,80,96,576,7168,torch.float8_e4m3fnuz,ck,10,0,12.065,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,128,576,7168,torch.float8_e4m3fnuz,ck,25,0,12.3755,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,854.08,419.68,0.0 +gfx942,80,160,576,7168,torch.float8_e4m3fnuz,ck,10,0,16.0783,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,576,7168,torch.float8_e4m3fnuz,ck,10,0,17.0079,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,224,576,7168,torch.float8_e4m3fnuz,ck,25,0,17.1379,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,256,576,7168,torch.float8_e4m3fnuz,ck,112,0,18.3779,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1150.26,340.56,0.0 +gfx942,80,288,576,7168,torch.float8_e4m3fnuz,ck,113,0,21.2123,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,320,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.5067,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,352,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.5607,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,384,576,7168,torch.float8_e4m3fnuz,ck,113,0,21.2619,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,416,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.8351,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,448,576,7168,torch.float8_e4m3fnuz,ck,112,0,22.8423,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,480,576,7168,torch.float8_e4m3fnuz,ck,112,0,24.5819,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,512,576,7168,torch.float8_e4m3fnuz,ck,114,0,23.9199,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1767.51,350.7,0.0 +gfx942,80,1024,576,7168,torch.float8_e4m3fnuz,ck,114,0,37.1872,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2273.82,340.13,0.0 +gfx942,80,1536,576,7168,torch.float8_e4m3fnuz,ck,128,0,47.2304,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2685.47,358.0,0.0 +gfx942,80,2048,576,7168,torch.float8_e4m3fnuz,ck,129,0,60.7837,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2782.23,348.25,0.0 +gfx942,80,4096,576,7168,torch.float8_e4m3fnuz,ck,93,0,113.3711,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2983.38,337.01,0.0 +gfx942,80,6144,576,7168,torch.float8_e4m3fnuz,ck,68,0,151.638,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,576,7168,torch.float8_e4m3fnuz,ck,93,0,191.4823,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3532.74,377.51,0.0 +gfx942,80,10240,576,7168,torch.float8_e4m3fnuz,ck,68,0,230.3376,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,576,7168,torch.float8_e4m3fnuz,ck,94,0,285.6522,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,576,7168,torch.float8_e4m3fnuz,ck,93,0,341.468,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,576,7168,torch.float8_e4m3fnuz,ck,68,0,372.1747,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3635.16,377.36,0.0 +gfx942,80,20480,576,7168,torch.float8_e4m3fnuz,ck,68,0,439.759,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3845.61,396.86,0.0 +gfx942,80,32768,576,7168,torch.float8_e4m3fnuz,ck,68,0,729.7961,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,576,7168,torch.float8_e4m3fnuz,ck,93,0,1429.9239,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,576,7168,torch.float8_e4m3fnuz,ck,95,0,2112.9835,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,576,7168,torch.float8_e4m3fnuz,ck,93,0,2822.5197,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.1282,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.21,605.96,0.0 +gfx942,80,2,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.0277,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.44,611.79,0.0 +gfx942,80,4,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.1184,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,4.85,608.74,0.0 +gfx942,80,8,1024,7168,torch.float8_e4m3fnuz,ck,10,0,12.2066,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.62,607.36,0.0 +gfx942,80,16,1024,7168,torch.float8_e4m3fnuz,ck,10,0,11.306,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,20.77,662.26,0.0 +gfx942,80,32,1024,7168,torch.float8_e4m3fnuz,ck,10,0,11.4004,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,41.21,669.71,0.0 +gfx942,80,64,1024,7168,torch.float8_e4m3fnuz,ck,11,0,12.7183,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,73.87,623.5,0.0 +gfx942,80,96,1024,7168,torch.float8_e4m3fnuz,ck,19,0,17.1761,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,82.05,478.85,0.0 +gfx942,80,128,1024,7168,torch.float8_e4m3fnuz,ck,6,0,18.5394,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,101.35,459.54,0.0 +gfx942,80,160,1024,7168,torch.float8_e4m3fnuz,ck,6,0,18.3958,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,127.68,479.16,0.0 +gfx942,80,192,1024,7168,torch.float8_e4m3fnuz,ck,113,0,21.6602,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,130.13,420.56,0.0 +gfx942,80,224,1024,7168,torch.float8_e4m3fnuz,ck,113,0,24.174,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,136.03,389.03,0.0 +gfx942,80,256,1024,7168,torch.float8_e4m3fnuz,ck,114,0,24.24,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,155.04,400.14,0.0 +gfx942,80,288,1024,7168,torch.float8_e4m3fnuz,ck,119,0,24.2463,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,174.37,412.2,0.0 +gfx942,80,320,1024,7168,torch.float8_e4m3fnuz,ck,114,0,24.291,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,193.39,423.58,0.0 +gfx942,80,352,1024,7168,torch.float8_e4m3fnuz,ck,120,0,31.8392,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,162.3,332.42,0.0 +gfx942,80,384,1024,7168,torch.float8_e4m3fnuz,ck,120,0,29.8956,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,188.56,363.9,0.0 +gfx942,80,416,1024,7168,torch.float8_e4m3fnuz,ck,120,0,31.629,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,193.08,353.28,0.0 +gfx942,80,448,1024,7168,torch.float8_e4m3fnuz,ck,120,0,31.7988,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,206.82,360.67,0.0 +gfx942,80,480,1024,7168,torch.float8_e4m3fnuz,ck,120,0,29.9502,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,235.27,392.78,0.0 +gfx942,80,512,1024,7168,torch.float8_e4m3fnuz,ck,119,0,37.7242,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,199.24,319.65,0.0 +gfx942,80,1024,1024,7168,torch.float8_e4m3fnuz,ck,85,0,59.714,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,251.74,280.96,0.0 +gfx942,80,2048,1024,7168,torch.float8_e4m3fnuz,ck,85,0,105.1832,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.83,249.23,0.0 +gfx942,80,4096,1024,7168,torch.float8_e4m3fnuz,ck,85,0,187.4655,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.75,240.52,0.0 +gfx942,80,6144,1024,7168,torch.float8_e4m3fnuz,ck,85,0,267.9814,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,336.57,238.68,0.0 +gfx942,80,8192,1024,7168,torch.float8_e4m3fnuz,ck,85,0,349.6474,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,343.94,236.92,0.0 +gfx942,80,10240,1024,7168,torch.float8_e4m3fnuz,ck,70,0,424.2224,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,354.35,239.76,0.0 +gfx942,80,12288,1024,7168,torch.float8_e4m3fnuz,ck,85,0,519.3639,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.33,232.18,0.0 +gfx942,80,14336,1024,7168,torch.float8_e4m3fnuz,ck,85,0,598.3138,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,351.74,233.09,0.0 +gfx942,80,16384,1024,7168,torch.float8_e4m3fnuz,ck,70,0,681.6293,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,352.86,232.29,0.0 +gfx942,80,32768,1024,7168,torch.float8_e4m3fnuz,ck,85,0,1338.2851,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,359.44,231.14,0.0 +gfx942,80,65536,1024,7168,torch.float8_e4m3fnuz,ck,85,0,2666.7419,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,360.77,229.24,0.0 +gfx942,80,98304,1024,7168,torch.float8_e4m3fnuz,ck,102,0,3838.9675,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,375.91,237.91,0.0 +gfx942,80,1,1280,8192,torch.float8_e4m3fnuz,ck,10,0,13.1036,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.6,801.04,0.0 +gfx942,80,32,1280,8192,torch.float8_e4m3fnuz,ck,10,0,12.0442,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,55.72,899.17,0.0 +gfx942,80,64,1280,8192,torch.float8_e4m3fnuz,ck,19,0,13.7807,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,973.95,810.84,0.0 +gfx942,80,128,1280,8192,torch.float8_e4m3fnuz,ck,6,0,20.4871,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1310.27,579.0,0.0 +gfx942,80,192,1280,8192,torch.float8_e4m3fnuz,ck,113,0,23.6583,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,170.2,530.48,0.0 +gfx942,80,256,1280,8192,torch.float8_e4m3fnuz,ck,114,0,26.0071,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,206.43,509.03,0.0 +gfx942,80,320,1280,8192,torch.float8_e4m3fnuz,ck,115,0,32.2959,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2077.94,431.21,0.0 +gfx942,80,512,1280,8192,torch.float8_e4m3fnuz,ck,114,0,42.3216,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2537.1,377.84,0.0 +gfx942,80,1024,1280,8192,torch.float8_e4m3fnuz,ck,85,0,68.1209,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3152.46,315.55,0.0 +gfx942,80,2048,1280,8192,torch.float8_e4m3fnuz,ck,0,0,119.2848,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3600.6,272.51,0.0 +gfx942,80,4096,1280,8192,torch.float8_e4m3fnuz,ck,71,0,234.5689,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3662.01,232.45,0.0 +gfx942,80,8192,1280,8192,torch.float8_e4m3fnuz,ck,71,0,452.4299,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3797.24,217.86,0.0 +gfx942,80,16384,1280,8192,torch.float8_e4m3fnuz,ck,71,0,902.6619,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3806.49,206.77,0.0 +gfx942,80,1,1536,7168,torch.float8_e4m3fnuz,ck,10,0,11.721,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,1536,7168,torch.float8_e4m3fnuz,ck,10,0,12.1679,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,1536,7168,torch.float8_e4m3fnuz,ck,10,0,12.2579,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,1536,7168,torch.float8_e4m3fnuz,ck,10,0,12.559,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,1536,7168,torch.float8_e4m3fnuz,ck,10,0,10.6831,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,32.98,1045.94,0.0 +gfx942,80,32,1536,7168,torch.float8_e4m3fnuz,ck,11,0,12.4632,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,56.54,909.7,0.0 +gfx942,80,64,1536,7168,torch.float8_e4m3fnuz,ck,19,0,16.9363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,832.11,688.78,0.0 +gfx942,80,96,1536,7168,torch.float8_e4m3fnuz,ck,6,0,18.0262,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,128,1536,7168,torch.float8_e4m3fnuz,ck,112,0,22.8955,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1231.06,538.13,0.0 +gfx942,80,160,1536,7168,torch.float8_e4m3fnuz,ck,119,0,24.4126,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,1536,7168,torch.float8_e4m3fnuz,ck,114,0,24.0194,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,224,1536,7168,torch.float8_e4m3fnuz,ck,115,0,31.7078,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,256,1536,7168,torch.float8_e4m3fnuz,ck,120,0,31.1851,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1807.64,437.12,0.0 +gfx942,80,288,1536,7168,torch.float8_e4m3fnuz,ck,120,0,29.6846,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,320,1536,7168,torch.float8_e4m3fnuz,ck,126,0,32.1157,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,352,1536,7168,torch.float8_e4m3fnuz,ck,133,0,39.9737,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,384,1536,7168,torch.float8_e4m3fnuz,ck,114,0,37.6905,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,1536,7168,torch.float8_e4m3fnuz,ck,119,0,38.3401,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,448,1536,7168,torch.float8_e4m3fnuz,ck,113,0,46.246,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,480,1536,7168,torch.float8_e4m3fnuz,ck,113,0,45.4048,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,512,1536,7168,torch.float8_e4m3fnuz,ck,128,0,47.65,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2366.06,341.09,0.0 +gfx942,80,1024,1536,7168,torch.float8_e4m3fnuz,ck,136,0,74.0965,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,3043.14,290.11,0.0 +gfx942,80,1536,1536,7168,torch.float8_e4m3fnuz,ck,0,0,105.4423,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3207.71,253.59,0.0 +gfx942,80,2048,1536,7168,torch.float8_e4m3fnuz,ck,85,0,138.8692,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3247.46,230.3,0.0 +gfx942,80,4096,1536,7168,torch.float8_e4m3fnuz,ck,85,0,256.917,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3510.64,206.11,0.0 +gfx942,80,6144,1536,7168,torch.float8_e4m3fnuz,ck,93,0,368.8956,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,1536,7168,torch.float8_e4m3fnuz,ck,72,0,501.1581,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3599.44,189.35,0.0 +gfx942,80,10240,1536,7168,torch.float8_e4m3fnuz,ck,68,0,583.4998,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,1536,7168,torch.float8_e4m3fnuz,ck,94,0,718.7933,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,1536,7168,torch.float8_e4m3fnuz,ck,93,0,830.4984,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,1536,7168,torch.float8_e4m3fnuz,ck,68,0,940.0969,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3837.66,190.17,0.0 +gfx942,80,20480,1536,7168,torch.float8_e4m3fnuz,ck,93,0,1157.9543,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3894.55,190.62,0.0 +gfx942,80,32768,1536,7168,torch.float8_e4m3fnuz,ck,68,0,1872.4274,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,1536,7168,torch.float8_e4m3fnuz,ck,93,0,3736.9156,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,1536,7168,torch.float8_e4m3fnuz,ck,102,0,5497.497,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,1536,7168,torch.float8_e4m3fnuz,ck,68,0,7429.8789,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,2048,7168,torch.float8_e4m3fnuz,ck,10,0,12.1709,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.41,1207.09,0.0 +gfx942,80,2,2048,7168,torch.float8_e4m3fnuz,ck,10,0,11.7272,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.01,1253.72,0.0 +gfx942,80,4,2048,7168,torch.float8_e4m3fnuz,ck,10,0,12.2432,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.59,1202.72,0.0 +gfx942,80,8,2048,7168,torch.float8_e4m3fnuz,ck,10,0,12.4565,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,18.86,1185.74,0.0 +gfx942,80,16,2048,7168,torch.float8_e4m3fnuz,ck,10,0,11.2137,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,41.89,1325.19,0.0 +gfx942,80,32,2048,7168,torch.float8_e4m3fnuz,ck,11,0,12.645,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,74.3,1189.44,0.0 +gfx942,80,64,2048,7168,torch.float8_e4m3fnuz,ck,6,0,18.2067,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,103.21,845.9,0.0 +gfx942,80,96,2048,7168,torch.float8_e4m3fnuz,ck,113,0,21.5433,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,130.83,731.62,0.0 +gfx942,80,128,2048,7168,torch.float8_e4m3fnuz,ck,114,0,24.2173,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,155.18,665.72,0.0 +gfx942,80,160,2048,7168,torch.float8_e4m3fnuz,ck,119,0,24.4219,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,192.35,674.9,0.0 +gfx942,80,192,2048,7168,torch.float8_e4m3fnuz,ck,120,0,29.915,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,188.44,563.02,0.0 +gfx942,80,224,2048,7168,torch.float8_e4m3fnuz,ck,120,0,31.6736,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,207.64,543.14,0.0 +gfx942,80,256,2048,7168,torch.float8_e4m3fnuz,ck,114,0,38.2291,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,196.61,459.43,0.0 +gfx942,80,288,2048,7168,torch.float8_e4m3fnuz,ck,119,0,38.7116,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,218.43,463.02,0.0 +gfx942,80,320,2048,7168,torch.float8_e4m3fnuz,ck,114,0,38.5684,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,243.6,474.08,0.0 +gfx942,80,352,2048,7168,torch.float8_e4m3fnuz,ck,122,0,50.8883,a8w8_bpreshuffle_256x80x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,203.09,366.39,0.0 +gfx942,80,384,2048,7168,torch.float8_e4m3fnuz,ck,123,0,50.7212,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,222.28,374.7,0.0 +gfx942,80,416,2048,7168,torch.float8_e4m3fnuz,ck,120,0,52.9749,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,230.56,365.57,0.0 +gfx942,80,448,2048,7168,torch.float8_e4m3fnuz,ck,123,0,53.8786,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,244.13,366.13,0.0 +gfx942,80,480,2048,7168,torch.float8_e4m3fnuz,ck,123,0,51.3534,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,274.43,391.15,0.0 +gfx942,80,512,2048,7168,torch.float8_e4m3fnuz,ck,85,0,60.8768,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.93,335.88,0.0 +gfx942,80,1024,2048,7168,torch.float8_e4m3fnuz,ck,85,0,105.0089,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,286.31,249.64,0.0 +gfx942,80,2048,2048,7168,torch.float8_e4m3fnuz,ck,85,0,184.1052,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.6,205.04,0.0 +gfx942,80,4096,2048,7168,torch.float8_e4m3fnuz,ck,85,0,340.0794,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,353.62,178.83,0.0 +gfx942,80,6144,2048,7168,torch.float8_e4m3fnuz,ck,85,0,507.8765,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,355.18,165.17,0.0 +gfx942,80,8192,2048,7168,torch.float8_e4m3fnuz,ck,71,0,664.0129,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,362.22,161.07,0.0 +gfx942,80,10240,2048,7168,torch.float8_e4m3fnuz,ck,85,0,828.0104,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,363.1,157.03,0.0 +gfx942,80,12288,2048,7168,torch.float8_e4m3fnuz,ck,102,0,976.0123,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.64,156.85,0.0 +gfx942,80,14336,2048,7168,torch.float8_e4m3fnuz,ck,102,0,1150.9875,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.69,153.05,0.0 +gfx942,80,16384,2048,7168,torch.float8_e4m3fnuz,ck,85,0,1318.349,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.88,151.12,0.0 +gfx942,80,32768,2048,7168,torch.float8_e4m3fnuz,ck,85,0,2600.154,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,370.01,147.6,0.0 +gfx942,80,65536,2048,7168,torch.float8_e4m3fnuz,ck,85,0,5214.9027,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,368.97,144.37,0.0 +gfx942,80,98304,2048,7168,torch.float8_e4m3fnuz,ck,102,0,7585.8107,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.48,147.9,0.0 +gfx942,80,1,2112,7168,torch.float8_e4m3fnuz,ck,10,0,11.7937,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.57,1284.6,0.0 +gfx942,80,16,2112,7168,torch.float8_e4m3fnuz,ck,10,0,10.8571,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,44.62,1411.16,0.0 +gfx942,80,32,2112,7168,torch.float8_e4m3fnuz,ck,11,0,12.5633,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,77.12,1234.02,0.0 +gfx942,80,48,2112,7168,torch.float8_e4m3fnuz,ck,10,0,18.4707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,786.83,849.22,0.0 +gfx942,80,64,2112,7168,torch.float8_e4m3fnuz,ck,12,0,18.5595,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,104.41,854.97,0.0 +gfx942,80,80,2112,7168,torch.float8_e4m3fnuz,ck,113,0,23.5729,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,102.75,680.87,0.0 +gfx942,80,96,2112,7168,torch.float8_e4m3fnuz,ck,113,0,21.5242,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,135.04,754.15,0.0 +gfx942,80,112,2112,7168,torch.float8_e4m3fnuz,ck,112,0,25.9619,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1306.18,632.26,0.0 +gfx942,80,128,2112,7168,torch.float8_e4m3fnuz,ck,114,0,23.6699,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,163.73,701.19,0.0 +gfx942,80,160,2112,7168,torch.float8_e4m3fnuz,ck,115,0,28.9398,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1673.96,586.1,0.0 +gfx942,80,192,2112,7168,torch.float8_e4m3fnuz,ck,126,0,32.1086,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1810.51,539.61,0.0 +gfx942,80,224,2112,7168,torch.float8_e4m3fnuz,ck,126,0,32.5198,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,2085.56,544.0,0.0 +gfx942,80,256,2112,7168,torch.float8_e4m3fnuz,ck,114,0,37.3908,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2072.99,482.88,0.0 +gfx942,80,288,2112,7168,torch.float8_e4m3fnuz,ck,113,0,44.2402,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1971.05,416.36,0.0 +gfx942,80,320,2112,7168,torch.float8_e4m3fnuz,ck,113,0,45.9122,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2110.3,409.13,0.0 +gfx942,80,352,2112,7168,torch.float8_e4m3fnuz,ck,128,0,49.7251,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2143.33,385.09,0.0 +gfx942,80,384,2112,7168,torch.float8_e4m3fnuz,ck,128,0,47.5375,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2445.78,410.48,0.0 +gfx942,80,512,2112,7168,torch.float8_e4m3fnuz,ck,129,0,60.4509,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2564.42,346.92,0.0 +gfx942,80,1024,2112,7168,torch.float8_e4m3fnuz,ck,93,0,115.8099,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2677.17,231.45,0.0 +gfx942,80,1536,2112,7168,torch.float8_e4m3fnuz,ck,68,0,148.9129,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3123.06,219.17,0.0 +gfx942,80,2048,2112,7168,torch.float8_e4m3fnuz,ck,93,0,186.5271,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3324.37,206.24,0.0 +gfx942,80,4096,2112,7168,torch.float8_e4m3fnuz,ck,93,0,329.2957,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3766.13,187.67,0.0 +gfx942,80,8192,2112,7168,torch.float8_e4m3fnuz,ck,68,0,649.3748,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3819.59,167.03,0.0 +gfx942,80,16384,2112,7168,torch.float8_e4m3fnuz,ck,93,0,1284.8613,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3860.87,157.05,0.0 +gfx942,80,32768,2112,7168,torch.float8_e4m3fnuz,ck,93,0,2566.2639,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3866.08,151.36,0.0 +gfx942,80,49152,2112,7168,torch.float8_e4m3fnuz,ck,93,0,3805.7733,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3910.39,151.11,0.0 +gfx942,80,65536,2112,7168,torch.float8_e4m3fnuz,ck,93,0,5062.5452,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3919.52,150.46,0.0 +gfx942,80,73728,2112,7168,torch.float8_e4m3fnuz,ck,93,0,5700.3689,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3916.08,150.0,0.0 +gfx942,80,98304,2112,7168,torch.float8_e4m3fnuz,ck,95,0,7892.5471,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,377.12,143.81,0.0 +gfx942,80,131072,2112,7168,torch.float8_e4m3fnuz,ck,93,0,10093.2233,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3931.9,149.44,0.0 +gfx942,80,1,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.693,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.75,1374.15,0.0 +gfx942,80,2,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.9922,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.36,1340.84,0.0 +gfx942,80,4,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.9683,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.73,1345.46,0.0 +gfx942,80,8,2240,7168,torch.float8_e4m3fnuz,ck,10,0,12.2626,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,20.95,1316.97,0.0 +gfx942,80,16,2240,7168,torch.float8_e4m3fnuz,ck,10,0,11.1279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,461.72,1459.64,0.0 +gfx942,80,32,2240,7168,torch.float8_e4m3fnuz,ck,11,0,12.5136,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,82.12,1312.9,0.0 +gfx942,80,48,2240,7168,torch.float8_e4m3fnuz,ck,10,0,18.3626,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,839.43,904.85,0.0 +gfx942,80,64,2240,7168,torch.float8_e4m3fnuz,ck,12,0,18.5811,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,110.61,904.24,0.0 +gfx942,80,80,2240,7168,torch.float8_e4m3fnuz,ck,113,0,23.5984,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1088.64,719.89,0.0 +gfx942,80,96,2240,7168,torch.float8_e4m3fnuz,ck,113,0,21.5933,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,142.77,795.36,0.0 +gfx942,80,112,2240,7168,torch.float8_e4m3fnuz,ck,112,0,26.1124,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1377.36,664.85,0.0 +gfx942,80,128,2240,7168,torch.float8_e4m3fnuz,ck,114,0,23.9804,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1714.07,731.73,0.0 +gfx942,80,160,2240,7168,torch.float8_e4m3fnuz,ck,115,0,28.9759,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,192,2240,7168,torch.float8_e4m3fnuz,ck,113,0,32.2862,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1909.68,566.58,0.0 +gfx942,80,224,2240,7168,torch.float8_e4m3fnuz,ck,117,0,37.5099,a8w8_bpreshuffle_256x112x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,256,2240,7168,torch.float8_e4m3fnuz,ck,114,0,37.1525,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2212.73,512.43,0.0 +gfx942,80,288,2240,7168,torch.float8_e4m3fnuz,ck,113,0,44.5,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,320,2240,7168,torch.float8_e4m3fnuz,ck,112,0,53.8671,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,190.77,367.27,0.0 +gfx942,80,352,2240,7168,torch.float8_e4m3fnuz,ck,78,0,54.0168,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,2240,7168,torch.float8_e4m3fnuz,ck,78,0,53.062,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,2240,7168,torch.float8_e4m3fnuz,ck,113,0,62.1588,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,448,2240,7168,torch.float8_e4m3fnuz,ck,114,0,70.1253,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,2240,7168,torch.float8_e4m3fnuz,ck,62,0,70.4585,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,2240,7168,torch.float8_e4m3fnuz,ck,62,0,66.3638,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2477.51,331.81,0.0 +gfx942,80,1024,2240,7168,torch.float8_e4m3fnuz,ck,114,0,120.7469,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2723.33,231.76,0.0 +gfx942,80,1536,2240,7168,torch.float8_e4m3fnuz,ck,48,0,177.0619,a8w8_bpreshuffle_256x192x224x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2785.75,191.73,0.0 +gfx942,80,2048,2240,7168,torch.float8_e4m3fnuz,ck,79,0,230.9106,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2848.14,172.84,0.0 +gfx942,80,4096,2240,7168,torch.float8_e4m3fnuz,ck,69,0,437.1832,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3008.66,145.86,0.0 +gfx942,80,6144,2240,7168,torch.float8_e4m3fnuz,ck,69,0,662.2802,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,2240,7168,torch.float8_e4m3fnuz,ck,69,0,857.9718,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3066.15,129.93,0.0 +gfx942,80,10240,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1018.0438,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1231.0498,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1461.7946,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,2240,7168,torch.float8_e4m3fnuz,ck,69,0,1644.8521,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3198.67,125.78,0.0 +gfx942,80,32768,2240,7168,torch.float8_e4m3fnuz,ck,69,0,3196.032,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3292.42,124.45,0.0 +gfx942,80,49152,2240,7168,torch.float8_e4m3fnuz,ck,69,0,4812.8899,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3279.53,122.29,0.0 +gfx942,80,65536,2240,7168,torch.float8_e4m3fnuz,ck,69,0,6365.1402,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3306.34,122.45,0.0 +gfx942,80,73728,2240,7168,torch.float8_e4m3fnuz,ck,69,0,7155.8623,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3308.62,122.26,0.0 +gfx942,80,98304,2240,7168,torch.float8_e4m3fnuz,ck,69,0,9755.5399,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,2240,7168,torch.float8_e4m3fnuz,ck,69,0,12728.0402,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3306.93,121.21,0.0 +gfx942,80,1,3072,1536,torch.float8_e4m3fnuz,ck,5,0,5.6562,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,3072,1536,torch.float8_e4m3fnuz,ck,11,0,5.911,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,3072,1536,torch.float8_e4m3fnuz,ck,11,0,5.8834,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,3072,1536,torch.float8_e4m3fnuz,ck,10,0,5.8254,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,3072,1536,torch.float8_e4m3fnuz,ck,11,0,5.8974,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,256.04,820.95,0.0 +gfx942,80,32,3072,1536,torch.float8_e4m3fnuz,ck,112,0,6.9922,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,431.9,709.98,0.0 +gfx942,80,64,3072,1536,torch.float8_e4m3fnuz,ck,114,0,9.0146,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,670.0,577.96,0.0 +gfx942,80,96,3072,1536,torch.float8_e4m3fnuz,ck,112,0,9.6664,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,937.24,564.42,0.0 +gfx942,80,128,3072,1536,torch.float8_e4m3fnuz,ck,112,0,11.1838,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1080.1,509.81,0.0 +gfx942,80,160,3072,1536,torch.float8_e4m3fnuz,ck,112,0,11.4913,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1313.99,517.56,0.0 +gfx942,80,192,3072,1536,torch.float8_e4m3fnuz,ck,119,0,12.4325,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1457.42,498.14,0.0 +gfx942,80,224,3072,1536,torch.float8_e4m3fnuz,ck,113,0,14.6673,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1441.25,439.0,0.0 +gfx942,80,256,3072,1536,torch.float8_e4m3fnuz,ck,120,0,16.0983,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1500.73,415.24,0.0 +gfx942,80,288,3072,1536,torch.float8_e4m3fnuz,ck,120,0,16.2173,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1675.93,427.35,0.0 +gfx942,80,320,3072,1536,torch.float8_e4m3fnuz,ck,119,0,16.7997,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1797.59,427.16,0.0 +gfx942,80,352,3072,1536,torch.float8_e4m3fnuz,ck,85,0,18.9609,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1751.97,391.43,0.0 +gfx942,80,384,3072,1536,torch.float8_e4m3fnuz,ck,85,0,18.7893,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1928.69,408.09,0.0 +gfx942,80,416,3072,1536,torch.float8_e4m3fnuz,ck,100,0,20.7247,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,448,3072,1536,torch.float8_e4m3fnuz,ck,92,0,22.3115,a8w8_bpreshuffle_256x32x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,480,3072,1536,torch.float8_e4m3fnuz,ck,94,0,22.3915,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,3072,1536,torch.float8_e4m3fnuz,ck,85,0,24.3751,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1982.28,354.9,0.0 +gfx942,80,1024,3072,1536,torch.float8_e4m3fnuz,ck,85,0,38.8336,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2488.48,324.02,0.0 +gfx942,80,1536,3072,1536,torch.float8_e4m3fnuz,ck,93,0,51.4668,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2816.48,320.89,0.0 +gfx942,80,2048,3072,1536,torch.float8_e4m3fnuz,ck,72,0,69.8541,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2766.82,292.71,0.0 +gfx942,80,4096,3072,1536,torch.float8_e4m3fnuz,ck,93,0,123.6652,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3125.75,292.53,0.0 +gfx942,80,6144,3072,1536,torch.float8_e4m3fnuz,ck,85,0,183.0209,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,3072,1536,torch.float8_e4m3fnuz,ck,71,0,235.4545,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3283.41,287.25,0.0 +gfx942,80,10240,3072,1536,torch.float8_e4m3fnuz,ck,71,0,290.0173,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,3072,1536,torch.float8_e4m3fnuz,ck,72,0,352.0628,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,14336,3072,1536,torch.float8_e4m3fnuz,ck,93,0,405.8954,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,3072,1536,torch.float8_e4m3fnuz,ck,71,0,450.9699,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3428.58,289.48,0.0 +gfx942,80,20480,3072,1536,torch.float8_e4m3fnuz,ck,71,0,557.954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3463.97,290.36,0.0 +gfx942,80,32768,3072,1536,torch.float8_e4m3fnuz,ck,71,0,909.865,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,3072,1536,torch.float8_e4m3fnuz,ck,71,0,1787.3796,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,3072,1536,torch.float8_e4m3fnuz,ck,102,0,2698.4705,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,3072,1536,torch.float8_e4m3fnuz,ck,71,0,3559.9187,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.2138,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.95,499.75,0.0 +gfx942,80,1,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.4343,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,4096,512,torch.float8_e4m3fnuz,ck,23,0,4.4526,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,2,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.2939,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,4096,512,torch.float8_e4m3fnuz,ck,23,0,4.5342,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,4,4096,7168,torch.float8_e4m3fnuz,ck,5,0,13.5363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.1858,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.6103,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.2875,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,15.65,521.61,0.0 +gfx942,80,16,4096,7168,torch.float8_e4m3fnuz,ck,11,0,13.1683,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,32,4096,512,torch.float8_e4m3fnuz,ck,9,0,4.8822,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,274.91,486.6,0.0 +gfx942,80,32,4096,7168,torch.float8_e4m3fnuz,ck,12,0,18.5983,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,48,4096,512,torch.float8_e4m3fnuz,ck,9,0,5.3534,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,376.07,469.78,0.0 +gfx942,80,64,4096,512,torch.float8_e4m3fnuz,ck,77,0,5.8414,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,459.54,454.38,0.0 +gfx942,80,64,4096,7168,torch.float8_e4m3fnuz,ck,114,0,23.8807,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,80,4096,512,torch.float8_e4m3fnuz,ck,76,0,6.8102,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,492.71,410.19,0.0 +gfx942,80,96,4096,512,torch.float8_e4m3fnuz,ck,76,0,6.6322,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,607.12,442.2,0.0 +gfx942,80,96,4096,7168,torch.float8_e4m3fnuz,ck,120,0,29.6988,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,112,4096,512,torch.float8_e4m3fnuz,ck,76,0,7.3154,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,642.15,419.94,0.0 +gfx942,80,128,4096,512,torch.float8_e4m3fnuz,ck,76,0,7.2018,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,745.47,445.9,0.0 +gfx942,80,128,4096,7168,torch.float8_e4m3fnuz,ck,121,0,38.8588,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,160,4096,512,torch.float8_e4m3fnuz,ck,84,0,7.7118,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,160,4096,7168,torch.float8_e4m3fnuz,ck,119,0,40.6369,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,4096,512,torch.float8_e4m3fnuz,ck,84,0,9.9422,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,4096,7168,torch.float8_e4m3fnuz,ck,123,0,49.1457,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,224,4096,512,torch.float8_e4m3fnuz,ck,76,0,10.187,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,224,4096,7168,torch.float8_e4m3fnuz,ck,120,0,52.8765,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,256,4096,512,torch.float8_e4m3fnuz,ck,76,0,10.9814,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,977.78,393.88,0.0 +gfx942,80,256,4096,7168,torch.float8_e4m3fnuz,ck,85,0,60.0334,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,288,4096,512,torch.float8_e4m3fnuz,ck,85,0,10.8502,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,288,4096,7168,torch.float8_e4m3fnuz,ck,85,0,62.9998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,4096,512,torch.float8_e4m3fnuz,ck,84,0,10.783,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,320,4096,7168,torch.float8_e4m3fnuz,ck,121,0,63.2822,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,352,4096,512,torch.float8_e4m3fnuz,ck,84,0,12.0603,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,352,4096,7168,torch.float8_e4m3fnuz,ck,136,0,74.9351,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,4096,512,torch.float8_e4m3fnuz,ck,84,0,12.2599,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,384,4096,7168,torch.float8_e4m3fnuz,ck,86,0,81.866,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,4096,512,torch.float8_e4m3fnuz,ck,85,0,13.5035,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,4096,7168,torch.float8_e4m3fnuz,ck,85,0,86.134,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,4096,512,torch.float8_e4m3fnuz,ck,85,0,13.3723,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,4096,7168,torch.float8_e4m3fnuz,ck,85,0,84.5872,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,4096,512,torch.float8_e4m3fnuz,ck,86,0,14.0771,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,4096,7168,torch.float8_e4m3fnuz,ck,86,0,82.8028,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,4096,512,torch.float8_e4m3fnuz,ck,85,0,15.7231,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1365.81,416.81,0.0 +gfx942,80,512,4096,7168,torch.float8_e4m3fnuz,ck,138,0,102.7541,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,4096,512,torch.float8_e4m3fnuz,ck,85,0,24.9039,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1724.62,442.1,0.0 +gfx942,80,1024,4096,7168,torch.float8_e4m3fnuz,ck,85,0,179.0322,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1536,4096,512,torch.float8_e4m3fnuz,ck,85,0,33.2015,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1940.41,465.84,0.0 +gfx942,80,2048,4096,512,torch.float8_e4m3fnuz,ck,71,0,43.6672,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1967.14,456.25,0.0 +gfx942,80,2048,4096,7168,torch.float8_e4m3fnuz,ck,85,0,328.802,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,4096,512,torch.float8_e4m3fnuz,ck,71,0,75.7421,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2268.21,498.39,0.0 +gfx942,80,4096,4096,7168,torch.float8_e4m3fnuz,ck,85,0,645.3373,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,4096,512,torch.float8_e4m3fnuz,ck,71,0,109.7738,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,4096,7168,torch.float8_e4m3fnuz,ck,102,0,937.0884,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,4096,512,torch.float8_e4m3fnuz,ck,71,0,140.1016,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2452.49,523.91,0.0 +gfx942,80,8192,4096,7168,torch.float8_e4m3fnuz,ck,85,0,1259.5274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,4096,512,torch.float8_e4m3fnuz,ck,71,0,173.8713,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,4096,7168,torch.float8_e4m3fnuz,ck,85,0,1574.2695,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,4096,512,torch.float8_e4m3fnuz,ck,71,0,207.219,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,4096,7168,torch.float8_e4m3fnuz,ck,102,0,1858.9016,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,4096,512,torch.float8_e4m3fnuz,ck,71,0,239.3595,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,4096,7168,torch.float8_e4m3fnuz,ck,85,0,2197.8676,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,4096,512,torch.float8_e4m3fnuz,ck,71,0,268.2002,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2562.25,539.54,0.0 +gfx942,80,16384,4096,7168,torch.float8_e4m3fnuz,ck,85,0,2498.7773,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,20480,4096,512,torch.float8_e4m3fnuz,ck,71,0,331.5621,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2590.75,543.96,0.0 +gfx942,80,32768,4096,512,torch.float8_e4m3fnuz,ck,71,0,523.3298,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2626.24,549.0,0.0 +gfx942,80,32768,4096,7168,torch.float8_e4m3fnuz,ck,74,0,5016.1101,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,65536,4096,512,torch.float8_e4m3fnuz,ck,71,0,1055.1661,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,4096,7168,torch.float8_e4m3fnuz,ck,85,0,9976.6212,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,4096,512,torch.float8_e4m3fnuz,ck,71,0,1607.6098,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,4096,7168,torch.float8_e4m3fnuz,ck,102,0,14573.0079,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,4096,512,torch.float8_e4m3fnuz,ck,71,0,2125.6078,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,4096,7168,torch.float8_e4m3fnuz,ck,85,0,19941.3998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,4608,4096,torch.float8_e4m3fnuz,ck,5,0,9.6645,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.91,1954.34,0.0 +gfx942,80,1,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.8515,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,47.69,2385.77,0.0 +gfx942,80,2,4608,4096,torch.float8_e4m3fnuz,ck,11,0,9.5889,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,7.87,1971.13,0.0 +gfx942,80,2,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.597,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,4608,4096,torch.float8_e4m3fnuz,ck,11,0,9.7763,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,15.44,1936.07,0.0 +gfx942,80,4,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.6563,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,4608,4096,torch.float8_e4m3fnuz,ck,10,0,9.6358,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,31.34,1969.83,0.0 +gfx942,80,8,4608,7168,torch.float8_e4m3fnuz,ck,5,0,13.8099,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,4608,4096,torch.float8_e4m3fnuz,ck,10,0,9.4967,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,63.6,2009.89,0.0 +gfx942,80,16,4608,7168,torch.float8_e4m3fnuz,ck,11,0,13.1327,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,80.48,2535.07,0.0 +gfx942,80,32,4608,4096,torch.float8_e4m3fnuz,ck,12,0,12.9381,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,93.36,1491.75,0.0 +gfx942,80,32,4608,7168,torch.float8_e4m3fnuz,ck,12,0,19.2855,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,109.61,1739.88,0.0 +gfx942,80,48,4608,7168,torch.float8_e4m3fnuz,ck,113,0,21.7204,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,145.99,1556.9,0.0 +gfx942,80,64,4608,4096,torch.float8_e4m3fnuz,ck,114,0,16.3715,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1475.69,1204.92,0.0 +gfx942,80,64,4608,7168,torch.float8_e4m3fnuz,ck,114,0,23.8378,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,177.36,1429.61,0.0 +gfx942,80,80,4608,7168,torch.float8_e4m3fnuz,ck,115,0,28.763,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,183.74,1193.92,0.0 +gfx942,80,96,4608,7168,torch.float8_e4m3fnuz,ck,120,0,29.4707,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2151.9,1174.15,0.0 +gfx942,80,112,4608,7168,torch.float8_e4m3fnuz,ck,117,0,37.2568,a8w8_bpreshuffle_256x112x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1985.88,935.81,0.0 +gfx942,80,128,4608,4096,torch.float8_e4m3fnuz,ck,114,0,24.38,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,198.19,844.06,0.0 +gfx942,80,128,4608,7168,torch.float8_e4m3fnuz,ck,114,0,38.4671,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.82,913.18,0.0 +gfx942,80,160,4608,7168,torch.float8_e4m3fnuz,ck,122,0,46.244,a8w8_bpreshuffle_256x80x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,192,4608,7168,torch.float8_e4m3fnuz,ck,128,0,47.9264,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,224,4608,7168,torch.float8_e4m3fnuz,ck,124,0,56.3824,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,256,4608,4096,torch.float8_e4m3fnuz,ck,85,0,37.982,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2544.28,586.65,0.0 +gfx942,80,256,4608,7168,torch.float8_e4m3fnuz,ck,121,0,62.2609,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2716.22,597.88,0.0 +gfx942,80,288,4608,7168,torch.float8_e4m3fnuz,ck,130,0,66.7012,a8w8_bpreshuffle_256x96x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,4608,7168,torch.float8_e4m3fnuz,ck,136,0,72.1189,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,352,4608,7168,torch.float8_e4m3fnuz,ck,128,0,84.2541,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,4608,7168,torch.float8_e4m3fnuz,ck,93,0,81.9457,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,4608,7168,torch.float8_e4m3fnuz,ck,138,0,101.4186,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,4608,7168,torch.float8_e4m3fnuz,ck,138,0,97.0854,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,4608,7168,torch.float8_e4m3fnuz,ck,56,0,105.9346,a8w8_bpreshuffle_256x160x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,4608,7168,torch.float8_e4m3fnuz,ck,70,0,105.5819,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3203.47,392.29,0.0 +gfx942,80,1024,4608,4096,torch.float8_e4m3fnuz,ck,71,0,120.44,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3209.46,269.89,0.0 +gfx942,80,1024,4608,7168,torch.float8_e4m3fnuz,ck,93,0,189.1551,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3576.2,263.31,0.0 +gfx942,80,1536,4608,7168,torch.float8_e4m3fnuz,ck,85,0,279.9019,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3625.15,207.92,0.0 +gfx942,80,2048,4608,4096,torch.float8_e4m3fnuz,ck,68,0,218.9032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3531.67,210.77,0.0 +gfx942,80,2048,4608,7168,torch.float8_e4m3fnuz,ck,93,0,366.0791,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3695.69,181.89,0.0 +gfx942,80,4096,4608,4096,torch.float8_e4m3fnuz,ck,85,0,420.6946,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3675.32,174.47,0.0 +gfx942,80,4096,4608,7168,torch.float8_e4m3fnuz,ck,85,0,715.7879,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3780.21,139.9,0.0 +gfx942,80,6144,4608,7168,torch.float8_e4m3fnuz,ck,102,0,1071.1237,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,4608,7168,torch.float8_e4m3fnuz,ck,93,0,1405.0902,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3851.47,119.03,0.0 +gfx942,80,10240,4608,7168,torch.float8_e4m3fnuz,ck,93,0,1722.172,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,4608,7168,torch.float8_e4m3fnuz,ck,68,0,2079.1883,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,4608,7168,torch.float8_e4m3fnuz,ck,93,0,2431.1066,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,4608,4096,torch.float8_e4m3fnuz,ck,93,0,1613.6388,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3832.8,146.86,0.0 +gfx942,80,16384,4608,7168,torch.float8_e4m3fnuz,ck,93,0,2770.8265,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3906.17,108.8,0.0 +gfx942,80,20480,4608,7168,torch.float8_e4m3fnuz,ck,68,0,3421.2891,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3954.4,107.73,0.0 +gfx942,80,32768,4608,4096,torch.float8_e4m3fnuz,ck,93,0,3207.2182,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3856.77,141.89,0.0 +gfx942,80,32768,4608,7168,torch.float8_e4m3fnuz,ck,68,0,5527.6346,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3916.08,103.1,0.0 +gfx942,80,65536,4608,7168,torch.float8_e4m3fnuz,ck,68,0,11016.6648,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,4608,7168,torch.float8_e4m3fnuz,ck,102,0,16377.2032,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,4608,7168,torch.float8_e4m3fnuz,ck,93,0,21987.1209,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.4756,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.52,1264.25,0.0 +gfx942,80,2,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.3933,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.11,1280.19,0.0 +gfx942,80,4,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.5748,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.97,1253.17,0.0 +gfx942,80,8,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.6156,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.83,1253.71,0.0 +gfx942,80,16,6144,1536,torch.float8_e4m3fnuz,ck,10,0,7.8653,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,38.4,1227.97,0.0 +gfx942,80,32,6144,1536,torch.float8_e4m3fnuz,ck,112,0,9.748,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,61.96,1013.5,0.0 +gfx942,80,64,6144,1536,torch.float8_e4m3fnuz,ck,112,0,12.3918,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,97.48,832.96,0.0 +gfx942,80,96,6144,1536,torch.float8_e4m3fnuz,ck,119,0,13.368,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,135.54,805.23,0.0 +gfx942,80,128,6144,1536,torch.float8_e4m3fnuz,ck,112,0,17.1016,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,141.27,655.3,0.0 +gfx942,80,160,6144,1536,torch.float8_e4m3fnuz,ck,119,0,17.7021,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,170.6,658.06,0.0 +gfx942,80,192,6144,1536,torch.float8_e4m3fnuz,ck,121,0,20.6837,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,175.2,584.59,0.0 +gfx942,80,224,6144,1536,torch.float8_e4m3fnuz,ck,92,0,24.1353,a8w8_bpreshuffle_256x32x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,175.17,519.31,0.0 +gfx942,80,256,6144,1536,torch.float8_e4m3fnuz,ck,85,0,26.6995,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,180.97,486.01,0.0 +gfx942,80,288,6144,1536,torch.float8_e4m3fnuz,ck,86,0,27.4257,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,198.2,489.27,0.0 +gfx942,80,320,6144,1536,torch.float8_e4m3fnuz,ck,93,0,28.1164,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.81,492.98,0.0 +gfx942,80,352,6144,1536,torch.float8_e4m3fnuz,ck,85,0,33.8656,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,196.18,422.35,0.0 +gfx942,80,384,6144,1536,torch.float8_e4m3fnuz,ck,85,0,33.7439,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.79,436.99,0.0 +gfx942,80,416,6144,1536,torch.float8_e4m3fnuz,ck,93,0,37.9824,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,206.72,399.87,0.0 +gfx942,80,448,6144,1536,torch.float8_e4m3fnuz,ck,93,0,37.7868,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,223.77,413.65,0.0 +gfx942,80,480,6144,1536,torch.float8_e4m3fnuz,ck,94,0,38.3828,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,236.04,418.75,0.0 +gfx942,80,512,6144,1536,torch.float8_e4m3fnuz,ck,85,0,42.8776,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,225.38,385.17,0.0 +gfx942,80,1024,6144,1536,torch.float8_e4m3fnuz,ck,85,0,74.8161,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,258.33,315.35,0.0 +gfx942,80,2048,6144,1536,torch.float8_e4m3fnuz,ck,71,0,137.2984,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,281.54,274.94,0.0 +gfx942,80,4096,6144,1536,torch.float8_e4m3fnuz,ck,71,0,262.0027,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,295.07,252.14,0.0 +gfx942,80,6144,6144,1536,torch.float8_e4m3fnuz,ck,71,0,375.4724,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.85,251.34,0.0 +gfx942,80,8192,6144,1536,torch.float8_e4m3fnuz,ck,71,0,500.5056,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.93,245.12,0.0 +gfx942,80,10240,6144,1536,torch.float8_e4m3fnuz,ck,71,0,617.2362,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,313.13,244.63,0.0 +gfx942,80,12288,6144,1536,torch.float8_e4m3fnuz,ck,71,0,738.7716,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,313.94,242.71,0.0 +gfx942,80,14336,6144,1536,torch.float8_e4m3fnuz,ck,71,0,860.2231,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.55,241.35,0.0 +gfx942,80,16384,6144,1536,torch.float8_e4m3fnuz,ck,71,0,983.6366,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.38,239.85,0.0 +gfx942,80,32768,6144,1536,torch.float8_e4m3fnuz,ck,71,0,1953.3257,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.63,236.74,0.0 +gfx942,80,65536,6144,1536,torch.float8_e4m3fnuz,ck,71,0,3873.7845,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,319.31,236.31,0.0 +gfx942,80,98304,6144,1536,torch.float8_e4m3fnuz,ck,71,0,5774.5261,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.31,236.97,0.0 +gfx942,80,1,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.4606,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,5.68,286.29,0.0 +gfx942,80,1,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6245,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.31,655.15,0.0 +gfx942,80,1,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.6648,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2.2,1103.62,0.0 +gfx942,80,1,7168,2048,torch.float8_e4m3fnuz,ck,10,0,9.1526,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,1,7168,2304,torch.float8_e4m3fnuz,ck,108,0,9.2341,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.58,1790.29,0.0 +gfx942,80,1,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.7478,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.0,2500.77,0.0 +gfx942,80,1,7168,4608,torch.float8_e4m3fnuz,ck,24,0,13.6306,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,4.85,2424.62,0.0 +gfx942,80,1,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.2876,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.79,2895.5,0.0 +gfx942,80,1,7168,9216,torch.float8_e4m3fnuz,ck,10,0,22.841,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.78,2893.21,0.0 +gfx942,80,1,7168,16384,torch.float8_e4m3fnuz,ck,24,0,39.678,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,1,7168,18432,torch.float8_e4m3fnuz,ck,10,0,42.556,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.3266,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6086,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.62,659.65,0.0 +gfx942,80,2,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.7221,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,4.37,1096.5,0.0 +gfx942,80,2,7168,2048,torch.float8_e4m3fnuz,ck,5,0,8.5166,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,7168,2304,torch.float8_e4m3fnuz,ck,29,0,9.905,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,2,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.8135,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.94,2488.42,0.0 +gfx942,80,2,7168,4608,torch.float8_e4m3fnuz,ck,10,0,13.4089,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.85,2466.13,0.0 +gfx942,80,2,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.503,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,11.46,2866.18,0.0 +gfx942,80,2,7168,9216,torch.float8_e4m3fnuz,ck,10,0,22.8918,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,11.54,2887.82,0.0 +gfx942,80,2,7168,16384,torch.float8_e4m3fnuz,ck,10,0,37.3783,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,7168,18432,torch.float8_e4m3fnuz,ck,10,0,42.2924,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,7168,256,torch.float8_e4m3fnuz,ck,73,0,6.9558,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6728,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.18,657.42,0.0 +gfx942,80,4,7168,1024,torch.float8_e4m3fnuz,ck,108,0,6.7173,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,8.74,1101.85,0.0 +gfx942,80,4,7168,2048,torch.float8_e4m3fnuz,ck,108,0,8.5722,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,7168,2304,torch.float8_e4m3fnuz,ck,108,0,9.9551,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.9762,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.61,2457.7,0.0 +gfx942,80,4,7168,4608,torch.float8_e4m3fnuz,ck,24,0,13.4779,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,19.61,2456.31,0.0 +gfx942,80,4,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.5581,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,22.85,2860.69,0.0 +gfx942,80,4,7168,9216,torch.float8_e4m3fnuz,ck,10,0,23.1344,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,22.84,2859.57,0.0 +gfx942,80,4,7168,16384,torch.float8_e4m3fnuz,ck,10,0,37.6428,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,7168,18432,torch.float8_e4m3fnuz,ck,10,0,42.1052,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.2842,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.6701,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.36,668.21,0.0 +gfx942,80,8,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.6192,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,17.74,1127.46,0.0 +gfx942,80,8,7168,2048,torch.float8_e4m3fnuz,ck,24,0,8.685,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,8,7168,2304,torch.float8_e4m3fnuz,ck,108,0,10.0038,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,7168,4096,torch.float8_e4m3fnuz,ck,10,0,11.8869,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,39.52,2482.36,0.0 +gfx942,80,8,7168,4608,torch.float8_e4m3fnuz,ck,24,0,13.8728,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,38.09,2391.85,0.0 +gfx942,80,8,7168,8192,torch.float8_e4m3fnuz,ck,10,0,20.8381,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,45.09,2826.58,0.0 +gfx942,80,8,7168,9216,torch.float8_e4m3fnuz,ck,24,0,24.049,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,43.95,2754.74,0.0 +gfx942,80,8,7168,16384,torch.float8_e4m3fnuz,ck,10,0,39.2255,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,7168,18432,torch.float8_e4m3fnuz,ck,24,0,45.0508,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,16,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.629,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,88.58,312.03,0.0 +gfx942,80,16,7168,512,torch.float8_e4m3fnuz,ck,9,0,5.4877,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,21.4,712.06,0.0 +gfx942,80,16,7168,1024,torch.float8_e4m3fnuz,ck,15,0,6.8597,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,34.24,1105.85,0.0 +gfx942,80,16,7168,2048,torch.float8_e4m3fnuz,ck,10,0,8.3657,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,56.15,1786.13,0.0 +gfx942,80,16,7168,2304,torch.float8_e4m3fnuz,ck,108,0,9.9995,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,52.85,1678.22,0.0 +gfx942,80,16,7168,4096,torch.float8_e4m3fnuz,ck,10,0,12.6967,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,74.0,2335.65,0.0 +gfx942,80,16,7168,4608,torch.float8_e4m3fnuz,ck,20,0,15.3599,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,68.81,2170.15,0.0 +gfx942,80,16,7168,8192,torch.float8_e4m3fnuz,ck,10,0,22.3668,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,84.01,2641.45,0.0 +gfx942,80,16,7168,9216,torch.float8_e4m3fnuz,ck,6,0,24.8784,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,84.97,2670.47,0.0 +gfx942,80,16,7168,16384,torch.float8_e4m3fnuz,ck,6,0,42.1056,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,7168,18432,torch.float8_e4m3fnuz,ck,6,0,45.8424,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,32,7168,256,torch.float8_e4m3fnuz,ck,75,0,6.8302,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,171.94,337.03,0.0 +gfx942,80,32,7168,512,torch.float8_e4m3fnuz,ck,76,0,6.5317,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,35.96,634.62,0.0 +gfx942,80,32,7168,1024,torch.float8_e4m3fnuz,ck,112,0,8.1404,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,57.71,962.06,0.0 +gfx942,80,32,7168,2048,torch.float8_e4m3fnuz,ck,119,0,10.94,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,85.88,1389.79,0.0 +gfx942,80,32,7168,2304,torch.float8_e4m3fnuz,ck,119,0,11.6614,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,906.38,1461.88,0.0 +gfx942,80,32,7168,4096,torch.float8_e4m3fnuz,ck,119,0,17.0986,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,109.89,1751.6,0.0 +gfx942,80,32,7168,4608,torch.float8_e4m3fnuz,ck,119,0,17.923,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,117.95,1876.71,0.0 +gfx942,80,32,7168,8192,torch.float8_e4m3fnuz,ck,119,0,29.6463,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,126.76,2005.01,0.0 +gfx942,80,32,7168,9216,torch.float8_e4m3fnuz,ck,119,0,31.763,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,133.11,2103.52,0.0 +gfx942,80,32,7168,16384,torch.float8_e4m3fnuz,ck,119,0,53.2367,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1411.84,2224.47,0.0 +gfx942,80,32,7168,18432,torch.float8_e4m3fnuz,ck,119,0,58.5048,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,48,7168,256,torch.float8_e4m3fnuz,ck,75,0,7.0126,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,251.21,361.55,0.0 +gfx942,80,48,7168,2304,torch.float8_e4m3fnuz,ck,113,0,13.6727,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1159.57,1266.3,0.0 +gfx942,80,64,7168,256,torch.float8_e4m3fnuz,ck,75,0,7.6886,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,305.49,360.13,0.0 +gfx942,80,64,7168,512,torch.float8_e4m3fnuz,ck,76,0,8.1001,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,57.99,570.4,0.0 +gfx942,80,64,7168,1024,torch.float8_e4m3fnuz,ck,112,0,10.2444,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,91.71,812.45,0.0 +gfx942,80,64,7168,2048,torch.float8_e4m3fnuz,ck,112,0,14.2819,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1315.69,1101.3,0.0 +gfx942,80,64,7168,2304,torch.float8_e4m3fnuz,ck,112,0,15.5151,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1362.5,1133.09,0.0 +gfx942,80,64,7168,4096,torch.float8_e4m3fnuz,ck,112,0,24.3757,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,154.17,1252.88,0.0 +gfx942,80,64,7168,4608,torch.float8_e4m3fnuz,ck,112,0,27.0894,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,156.07,1264.06,0.0 +gfx942,80,64,7168,8192,torch.float8_e4m3fnuz,ck,112,0,43.5233,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,172.69,1382.3,0.0 +gfx942,80,64,7168,9216,torch.float8_e4m3fnuz,ck,121,0,48.7661,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,173.39,1385.54,0.0 +gfx942,80,64,7168,16384,torch.float8_e4m3fnuz,ck,121,0,81.4496,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1845.61,1466.02,0.0 +gfx942,80,64,7168,18432,torch.float8_e4m3fnuz,ck,121,0,89.7754,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,80,7168,256,torch.float8_e4m3fnuz,ck,75,0,8.2998,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,353.75,361.74,0.0 +gfx942,80,80,7168,2304,torch.float8_e4m3fnuz,ck,113,0,18.9055,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1397.69,943.97,0.0 +gfx942,80,96,7168,256,torch.float8_e4m3fnuz,ck,75,0,8.6194,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,408.75,375.41,0.0 +gfx942,80,96,7168,512,torch.float8_e4m3fnuz,ck,76,0,9.4912,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,74.24,536.86,0.0 +gfx942,80,96,7168,1024,torch.float8_e4m3fnuz,ck,113,0,12.4833,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,112.89,706.11,0.0 +gfx942,80,96,7168,2048,torch.float8_e4m3fnuz,ck,113,0,17.6925,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1593.09,918.63,0.0 +gfx942,80,96,7168,2304,torch.float8_e4m3fnuz,ck,84,0,19.1495,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1655.86,945.85,0.0 +gfx942,80,96,7168,4096,torch.float8_e4m3fnuz,ck,113,0,29.9117,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,188.46,1040.72,0.0 +gfx942,80,96,7168,4608,torch.float8_e4m3fnuz,ck,113,0,33.003,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,192.16,1055.93,0.0 +gfx942,80,96,7168,8192,torch.float8_e4m3fnuz,ck,113,0,53.0736,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,212.43,1147.14,0.0 +gfx942,80,96,7168,9216,torch.float8_e4m3fnuz,ck,113,0,59.3348,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,213.76,1151.45,0.0 +gfx942,80,96,7168,16384,torch.float8_e4m3fnuz,ck,113,0,98.4525,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2290.3,1222.82,0.0 +gfx942,80,96,7168,18432,torch.float8_e4m3fnuz,ck,113,0,109.8187,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,112,7168,256,torch.float8_e4m3fnuz,ck,75,0,9.207,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,446.44,376.81,0.0 +gfx942,80,112,7168,2304,torch.float8_e4m3fnuz,ck,119,0,21.7835,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1698.25,843.7,0.0 +gfx942,80,128,7168,256,torch.float8_e4m3fnuz,ck,73,0,10.1406,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,463.25,365.14,0.0 +gfx942,80,128,7168,512,torch.float8_e4m3fnuz,ck,84,0,10.7725,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,87.22,517.11,0.0 +gfx942,80,128,7168,1024,torch.float8_e4m3fnuz,ck,119,0,14.7575,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,127.33,630.6,0.0 +gfx942,80,128,7168,2048,torch.float8_e4m3fnuz,ck,119,0,20.5111,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1832.23,817.96,0.0 +gfx942,80,128,7168,2304,torch.float8_e4m3fnuz,ck,84,0,21.9275,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1928.11,850.3,0.0 +gfx942,80,128,7168,4096,torch.float8_e4m3fnuz,ck,119,0,35.2197,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,213.41,900.62,0.0 +gfx942,80,128,7168,4608,torch.float8_e4m3fnuz,ck,119,0,39.2577,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,215.39,903.13,0.0 +gfx942,80,128,7168,8192,torch.float8_e4m3fnuz,ck,119,0,63.878,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,235.33,964.4,0.0 +gfx942,80,128,7168,9216,torch.float8_e4m3fnuz,ck,119,0,71.9358,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,235.09,960.23,0.0 +gfx942,80,128,7168,16384,torch.float8_e4m3fnuz,ck,114,0,119.9291,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2506.88,1012.04,0.0 +gfx942,80,128,7168,18432,torch.float8_e4m3fnuz,ck,114,0,133.6804,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,160,7168,256,torch.float8_e4m3fnuz,ck,75,0,10.251,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,160,7168,512,torch.float8_e4m3fnuz,ck,76,0,12.0125,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,97.77,503.28,0.0 +gfx942,80,160,7168,1024,torch.float8_e4m3fnuz,ck,112,0,17.3598,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,135.3,564.39,0.0 +gfx942,80,160,7168,2048,torch.float8_e4m3fnuz,ck,119,0,25.3574,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1852.56,682.31,0.0 +gfx942,80,160,7168,2304,torch.float8_e4m3fnuz,ck,119,0,27.2747,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,160,7168,4096,torch.float8_e4m3fnuz,ck,133,0,44.638,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,210.48,723.81,0.0 +gfx942,80,160,7168,4608,torch.float8_e4m3fnuz,ck,133,0,49.6981,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,212.68,725.6,0.0 +gfx942,80,160,7168,8192,torch.float8_e4m3fnuz,ck,133,0,81.2116,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,231.38,767.44,0.0 +gfx942,80,160,7168,9216,torch.float8_e4m3fnuz,ck,133,0,90.3778,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,233.9,772.63,0.0 +gfx942,80,160,7168,16384,torch.float8_e4m3fnuz,ck,136,0,150.746,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2493.0,811.67,0.0 +gfx942,80,160,7168,18432,torch.float8_e4m3fnuz,ck,136,0,168.4653,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,192,7168,256,torch.float8_e4m3fnuz,ck,73,0,10.5659,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,7168,512,torch.float8_e4m3fnuz,ck,84,0,13.4126,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,105.07,486.17,0.0 +gfx942,80,192,7168,1024,torch.float8_e4m3fnuz,ck,84,0,19.8942,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,141.68,517.19,0.0 +gfx942,80,192,7168,2048,torch.float8_e4m3fnuz,ck,120,0,28.1354,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2003.58,633.57,0.0 +gfx942,80,192,7168,2304,torch.float8_e4m3fnuz,ck,120,0,29.5015,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,7168,4096,torch.float8_e4m3fnuz,ck,120,0,48.3634,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,233.12,680.25,0.0 +gfx942,80,192,7168,4608,torch.float8_e4m3fnuz,ck,120,0,53.0886,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,238.91,690.68,0.0 +gfx942,80,192,7168,8192,torch.float8_e4m3fnuz,ck,120,0,87.1718,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,258.67,723.23,0.0 +gfx942,80,192,7168,9216,torch.float8_e4m3fnuz,ck,120,0,96.3409,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,263.31,732.63,0.0 +gfx942,80,192,7168,16384,torch.float8_e4m3fnuz,ck,120,0,162.2601,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2779.31,760.13,0.0 +gfx942,80,192,7168,18432,torch.float8_e4m3fnuz,ck,120,0,181.5802,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,224,7168,256,torch.float8_e4m3fnuz,ck,73,0,10.9003,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,224,7168,512,torch.float8_e4m3fnuz,ck,84,0,14.9991,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,109.62,466.43,0.0 +gfx942,80,224,7168,1024,torch.float8_e4m3fnuz,ck,84,0,20.7816,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,158.23,518.76,0.0 +gfx942,80,224,7168,2048,torch.float8_e4m3fnuz,ck,85,0,30.5218,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2154.74,601.21,0.0 +gfx942,80,224,7168,2304,torch.float8_e4m3fnuz,ck,85,0,32.8835,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,224,7168,4096,torch.float8_e4m3fnuz,ck,85,0,54.7818,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,240.1,611.31,0.0 +gfx942,80,224,7168,4608,torch.float8_e4m3fnuz,ck,85,0,60.3092,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,245.36,618.04,0.0 +gfx942,80,224,7168,8192,torch.float8_e4m3fnuz,ck,85,0,98.7952,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,266.27,645.44,0.0 +gfx942,80,224,7168,9216,torch.float8_e4m3fnuz,ck,85,0,110.1364,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,268.71,647.71,0.0 +gfx942,80,224,7168,16384,torch.float8_e4m3fnuz,ck,85,0,185.5598,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2835.39,669.98,0.0 +gfx942,80,224,7168,18432,torch.float8_e4m3fnuz,ck,85,0,207.6011,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,256,7168,256,torch.float8_e4m3fnuz,ck,73,0,11.5078,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,816.42,484.07,0.0 +gfx942,80,256,7168,512,torch.float8_e4m3fnuz,ck,85,0,16.1374,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,116.44,462.97,0.0 +gfx942,80,256,7168,1024,torch.float8_e4m3fnuz,ck,85,0,21.584,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,174.11,522.25,0.0 +gfx942,80,256,7168,2048,torch.float8_e4m3fnuz,ck,85,0,30.5235,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2462.43,618.36,0.0 +gfx942,80,256,7168,2304,torch.float8_e4m3fnuz,ck,85,0,32.0436,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2638.82,648.33,0.0 +gfx942,80,256,7168,4096,torch.float8_e4m3fnuz,ck,85,0,54.0567,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,278.09,630.43,0.0 +gfx942,80,256,7168,4608,torch.float8_e4m3fnuz,ck,85,0,60.5172,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.45,625.93,0.0 +gfx942,80,256,7168,8192,torch.float8_e4m3fnuz,ck,85,0,98.4777,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,305.3,654.84,0.0 +gfx942,80,256,7168,9216,torch.float8_e4m3fnuz,ck,85,0,109.9894,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,307.51,655.42,0.0 +gfx942,80,256,7168,16384,torch.float8_e4m3fnuz,ck,85,0,186.0958,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3231.11,673.34,0.0 +gfx942,80,256,7168,18432,torch.float8_e4m3fnuz,ck,85,0,209.5107,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,288,7168,256,torch.float8_e4m3fnuz,ck,72,0,12.7115,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,288,7168,512,torch.float8_e4m3fnuz,ck,76,0,17.9905,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,117.5,441.69,0.0 +gfx942,80,288,7168,1024,torch.float8_e4m3fnuz,ck,84,0,26.2028,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,161.35,448.95,0.0 +gfx942,80,288,7168,2048,torch.float8_e4m3fnuz,ck,85,0,38.1402,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2217.01,508.61,0.0 +gfx942,80,288,7168,2304,torch.float8_e4m3fnuz,ck,85,0,40.9768,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,288,7168,4096,torch.float8_e4m3fnuz,ck,85,0,69.139,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.6,501.43,0.0 +gfx942,80,288,7168,4608,torch.float8_e4m3fnuz,ck,85,0,76.8637,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,247.52,500.7,0.0 +gfx942,80,288,7168,8192,torch.float8_e4m3fnuz,ck,85,0,127.535,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,265.2,511.3,0.0 +gfx942,80,288,7168,9216,torch.float8_e4m3fnuz,ck,85,0,142.1807,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,267.62,512.33,0.0 +gfx942,80,288,7168,16384,torch.float8_e4m3fnuz,ck,85,0,243.1421,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2782.15,519.4,0.0 +gfx942,80,288,7168,18432,torch.float8_e4m3fnuz,ck,85,0,272.1414,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,7168,256,torch.float8_e4m3fnuz,ck,74,0,13.0231,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,320,7168,512,torch.float8_e4m3fnuz,ck,84,0,19.0693,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,123.17,441.62,0.0 +gfx942,80,320,7168,1024,torch.float8_e4m3fnuz,ck,85,0,26.7558,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,175.57,458.04,0.0 +gfx942,80,320,7168,2048,torch.float8_e4m3fnuz,ck,85,0,38.8094,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2420.87,513.35,0.0 +gfx942,80,320,7168,2304,torch.float8_e4m3fnuz,ck,85,0,40.2392,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,7168,4096,torch.float8_e4m3fnuz,ck,85,0,70.3183,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,267.22,501.41,0.0 +gfx942,80,320,7168,4608,torch.float8_e4m3fnuz,ck,85,0,79.5688,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,265.67,491.3,0.0 +gfx942,80,320,7168,8192,torch.float8_e4m3fnuz,ck,85,0,131.2114,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,286.42,502.47,0.0 +gfx942,80,320,7168,9216,torch.float8_e4m3fnuz,ck,85,0,146.2403,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,289.1,503.26,0.0 +gfx942,80,320,7168,16384,torch.float8_e4m3fnuz,ck,101,0,239.9889,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3131.89,530.32,0.0 +gfx942,80,320,7168,18432,torch.float8_e4m3fnuz,ck,101,0,270.4958,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,352,7168,256,torch.float8_e4m3fnuz,ck,75,0,15.1695,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,352,7168,512,torch.float8_e4m3fnuz,ck,84,0,20.5103,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,125.97,433.76,0.0 +gfx942,80,352,7168,1024,torch.float8_e4m3fnuz,ck,84,0,30.2515,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,170.81,421.36,0.0 +gfx942,80,352,7168,2048,torch.float8_e4m3fnuz,ck,100,0,45.5887,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,2266.96,448.52,0.0 +gfx942,80,352,7168,2304,torch.float8_e4m3fnuz,ck,100,0,49.1372,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,352,7168,4096,torch.float8_e4m3fnuz,ck,133,0,83.0217,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,248.97,431.79,0.0 +gfx942,80,352,7168,4608,torch.float8_e4m3fnuz,ck,133,0,92.2196,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,252.15,430.48,0.0 +gfx942,80,352,7168,8192,torch.float8_e4m3fnuz,ck,86,0,154.334,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,267.85,431.86,0.0 +gfx942,80,352,7168,9216,torch.float8_e4m3fnuz,ck,133,0,171.5389,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,271.11,433.43,0.0 +gfx942,80,352,7168,16384,torch.float8_e4m3fnuz,ck,86,0,289.7112,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2853.81,442.7,0.0 +gfx942,80,352,7168,18432,torch.float8_e4m3fnuz,ck,86,0,324.6944,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,7168,256,torch.float8_e4m3fnuz,ck,72,0,15.1515,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,384,7168,512,torch.float8_e4m3fnuz,ck,84,0,21.3082,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,132.28,439.81,0.0 +gfx942,80,384,7168,1024,torch.float8_e4m3fnuz,ck,84,0,31.9035,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,176.69,414.95,0.0 +gfx942,80,384,7168,2048,torch.float8_e4m3fnuz,ck,86,0,45.0074,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2504.99,465.96,0.0 +gfx942,80,384,7168,2304,torch.float8_e4m3fnuz,ck,86,0,48.0036,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,7168,4096,torch.float8_e4m3fnuz,ck,86,0,80.811,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.03,450.9,0.0 +gfx942,80,384,7168,4608,torch.float8_e4m3fnuz,ck,86,0,88.8226,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.59,453.77,0.0 +gfx942,80,384,7168,8192,torch.float8_e4m3fnuz,ck,86,0,144.2396,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,312.65,467.08,0.0 +gfx942,80,384,7168,9216,torch.float8_e4m3fnuz,ck,86,0,160.4438,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.21,468.1,0.0 +gfx942,80,384,7168,16384,torch.float8_e4m3fnuz,ck,86,0,268.8438,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3354.9,480.71,0.0 +gfx942,80,384,7168,18432,torch.float8_e4m3fnuz,ck,86,0,300.0663,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,7168,256,torch.float8_e4m3fnuz,ck,74,0,15.9059,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,416,7168,512,torch.float8_e4m3fnuz,ck,84,0,22.965,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,132.96,428.77,0.0 +gfx942,80,416,7168,1024,torch.float8_e4m3fnuz,ck,85,0,32.8392,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,185.96,418.09,0.0 +gfx942,80,416,7168,2048,torch.float8_e4m3fnuz,ck,85,0,48.1552,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,7168,2304,torch.float8_e4m3fnuz,ck,85,0,51.2564,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,7168,4096,torch.float8_e4m3fnuz,ck,85,0,88.4594,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,276.15,418.59,0.0 +gfx942,80,416,7168,4608,torch.float8_e4m3fnuz,ck,85,0,99.396,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,276.48,411.59,0.0 +gfx942,80,416,7168,8192,torch.float8_e4m3fnuz,ck,85,0,163.693,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,298.46,415.97,0.0 +gfx942,80,416,7168,9216,torch.float8_e4m3fnuz,ck,85,0,183.5982,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,299.36,413.17,0.0 +gfx942,80,416,7168,16384,torch.float8_e4m3fnuz,ck,85,0,310.2738,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,7168,18432,torch.float8_e4m3fnuz,ck,85,0,352.9218,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,7168,256,torch.float8_e4m3fnuz,ck,72,0,15.6539,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,448,7168,512,torch.float8_e4m3fnuz,ck,84,0,23.4146,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,140.44,440.83,0.0 +gfx942,80,448,7168,1024,torch.float8_e4m3fnuz,ck,85,0,32.9498,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,199.6,431.61,0.0 +gfx942,80,448,7168,2048,torch.float8_e4m3fnuz,ck,85,0,47.8488,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,7168,2304,torch.float8_e4m3fnuz,ck,85,0,49.72,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,7168,4096,torch.float8_e4m3fnuz,ck,85,0,89.2889,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,294.62,421.3,0.0 +gfx942,80,448,7168,4608,torch.float8_e4m3fnuz,ck,85,0,99.4928,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,297.46,417.29,0.0 +gfx942,80,448,7168,8192,torch.float8_e4m3fnuz,ck,85,0,164.1085,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.6,419.31,0.0 +gfx942,80,448,7168,9216,torch.float8_e4m3fnuz,ck,85,0,183.9126,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.84,416.57,0.0 +gfx942,80,448,7168,16384,torch.float8_e4m3fnuz,ck,85,0,316.355,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,7168,18432,torch.float8_e4m3fnuz,ck,85,0,356.7658,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,7168,256,torch.float8_e4m3fnuz,ck,72,0,16.1387,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,480,7168,512,torch.float8_e4m3fnuz,ck,72,0,25.9264,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,135.89,416.45,0.0 +gfx942,80,480,7168,1024,torch.float8_e4m3fnuz,ck,72,0,36.9129,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,190.89,398.58,0.0 +gfx942,80,480,7168,2048,torch.float8_e4m3fnuz,ck,102,0,52.4824,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,7168,2304,torch.float8_e4m3fnuz,ck,102,0,56.9516,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,7168,4096,torch.float8_e4m3fnuz,ck,102,0,96.7743,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,291.25,394.81,0.0 +gfx942,80,480,7168,4608,torch.float8_e4m3fnuz,ck,102,0,106.4225,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,297.95,395.81,0.0 +gfx942,80,480,7168,8192,torch.float8_e4m3fnuz,ck,102,0,176.1263,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.06,394.79,0.0 +gfx942,80,480,7168,9216,torch.float8_e4m3fnuz,ck,102,0,194.8844,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,325.41,396.98,0.0 +gfx942,80,480,7168,16384,torch.float8_e4m3fnuz,ck,86,0,346.0471,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,7168,18432,torch.float8_e4m3fnuz,ck,102,0,379.6631,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,7168,256,torch.float8_e4m3fnuz,ck,72,0,16.1951,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1160.26,574.63,0.0 +gfx942,80,512,7168,512,torch.float8_e4m3fnuz,ck,72,0,26.2156,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,143.35,429.98,0.0 +gfx942,80,512,7168,1024,torch.float8_e4m3fnuz,ck,71,0,37.0911,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,202.64,409.92,0.0 +gfx942,80,512,7168,2048,torch.float8_e4m3fnuz,ck,71,0,53.438,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2813.05,431.69,0.0 +gfx942,80,512,7168,2304,torch.float8_e4m3fnuz,ck,72,0,57.4769,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2942.3,435.56,0.0 +gfx942,80,512,7168,4096,torch.float8_e4m3fnuz,ck,71,0,99.0962,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,303.39,391.51,0.0 +gfx942,80,512,7168,4608,torch.float8_e4m3fnuz,ck,71,0,109.7596,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.15,389.3,0.0 +gfx942,80,512,7168,8192,torch.float8_e4m3fnuz,ck,71,0,182.1732,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,330.07,385.65,0.0 +gfx942,80,512,7168,9216,torch.float8_e4m3fnuz,ck,71,0,204.5549,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,330.7,381.9,0.0 +gfx942,80,512,7168,16384,torch.float8_e4m3fnuz,ck,71,0,346.2792,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,7168,18432,torch.float8_e4m3fnuz,ck,70,0,389.6687,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,7168,256,torch.float8_e4m3fnuz,ck,71,0,27.7779,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1352.91,603.98,0.0 +gfx942,80,1024,7168,512,torch.float8_e4m3fnuz,ck,71,0,46.6251,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,161.2,404.81,0.0 +gfx942,80,1024,7168,1024,torch.float8_e4m3fnuz,ck,71,0,66.699,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,225.38,345.86,0.0 +gfx942,80,1024,7168,2048,torch.float8_e4m3fnuz,ck,71,0,98.0567,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3066.06,320.81,0.0 +gfx942,80,1024,7168,2304,torch.float8_e4m3fnuz,ck,85,0,107.3839,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3149.71,312.47,0.0 +gfx942,80,1024,7168,4096,torch.float8_e4m3fnuz,ck,71,0,186.692,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.08,258.36,0.0 +gfx942,80,1024,7168,4608,torch.float8_e4m3fnuz,ck,71,0,208.1557,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,324.98,251.87,0.0 +gfx942,80,1024,7168,8192,torch.float8_e4m3fnuz,ck,85,0,349.1505,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,344.43,234.25,0.0 +gfx942,80,1024,7168,9216,torch.float8_e4m3fnuz,ck,85,0,390.4976,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,346.46,230.93,0.0 +gfx942,80,1024,7168,16384,torch.float8_e4m3fnuz,ck,85,0,661.406,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,7168,18432,torch.float8_e4m3fnuz,ck,85,0,745.9611,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1536,7168,256,torch.float8_e4m3fnuz,ck,71,0,37.3284,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1510.15,649.59,0.0 +gfx942,80,1536,7168,2048,torch.float8_e4m3fnuz,ck,85,0,138.0452,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3266.84,288.64,0.0 +gfx942,80,1536,7168,2304,torch.float8_e4m3fnuz,ck,85,0,150.2725,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3376.15,279.99,0.0 +gfx942,80,2048,7168,256,torch.float8_e4m3fnuz,ck,71,0,48.0096,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1565.56,660.69,0.0 +gfx942,80,2048,7168,512,torch.float8_e4m3fnuz,ck,71,0,82.6364,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,181.91,412.39,0.0 +gfx942,80,2048,7168,1024,torch.float8_e4m3fnuz,ck,85,0,122.5939,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,245.24,316.47,0.0 +gfx942,80,2048,7168,2048,torch.float8_e4m3fnuz,ck,85,0,186.7051,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3220.56,258.35,0.0 +gfx942,80,2048,7168,2304,torch.float8_e4m3fnuz,ck,85,0,199.0359,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3398.67,254.19,0.0 +gfx942,80,2048,7168,4096,torch.float8_e4m3fnuz,ck,85,0,351.1592,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,342.46,191.11,0.0 +gfx942,80,2048,7168,4608,torch.float8_e4m3fnuz,ck,85,0,390.2145,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,346.71,184.07,0.0 +gfx942,80,2048,7168,8192,torch.float8_e4m3fnuz,ck,85,0,657.8671,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.6,159.39,0.0 +gfx942,80,2048,7168,9216,torch.float8_e4m3fnuz,ck,85,0,738.9236,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,366.19,154.68,0.0 +gfx942,80,2048,7168,16384,torch.float8_e4m3fnuz,ck,85,0,1256.6524,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,2048,7168,18432,torch.float8_e4m3fnuz,ck,85,0,1411.2909,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,7168,256,torch.float8_e4m3fnuz,ck,71,0,85.7258,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1753.54,718.61,0.0 +gfx942,80,4096,7168,512,torch.float8_e4m3fnuz,ck,71,0,151.6161,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,198.3,425.33,0.0 +gfx942,80,4096,7168,1024,torch.float8_e4m3fnuz,ck,71,0,228.1081,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,263.6,307.99,0.0 +gfx942,80,4096,7168,2048,torch.float8_e4m3fnuz,ck,71,0,345.3094,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3482.65,236.86,0.0 +gfx942,80,4096,7168,2304,torch.float8_e4m3fnuz,ck,71,0,379.4759,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3565.22,223.13,0.0 +gfx942,80,4096,7168,4096,torch.float8_e4m3fnuz,ck,85,0,678.039,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,354.73,154.65,0.0 +gfx942,80,4096,7168,4608,torch.float8_e4m3fnuz,ck,85,0,756.3913,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,357.73,146.25,0.0 +gfx942,80,4096,7168,8192,torch.float8_e4m3fnuz,ck,85,0,1274.608,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.4,118.46,0.0 +gfx942,80,4096,7168,9216,torch.float8_e4m3fnuz,ck,85,0,1433.1369,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.61,113.41,0.0 +gfx942,80,4096,7168,16384,torch.float8_e4m3fnuz,ck,85,0,2443.998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,7168,18432,torch.float8_e4m3fnuz,ck,85,0,2773.2274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,7168,256,torch.float8_e4m3fnuz,ck,71,0,123.4161,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,7168,512,torch.float8_e4m3fnuz,ck,71,0,215.2661,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,209.49,440.83,0.0 +gfx942,80,6144,7168,1024,torch.float8_e4m3fnuz,ck,71,0,327.7242,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,275.21,310.36,0.0 +gfx942,80,6144,7168,2048,torch.float8_e4m3fnuz,ck,71,0,509.0789,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,7168,2304,torch.float8_e4m3fnuz,ck,71,0,560.4929,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,7168,4096,torch.float8_e4m3fnuz,ck,71,0,996.6903,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,361.98,143.08,0.0 +gfx942,80,6144,7168,4608,torch.float8_e4m3fnuz,ck,71,0,1121.7892,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,361.81,133.2,0.0 +gfx942,80,6144,7168,8192,torch.float8_e4m3fnuz,ck,71,0,1890.6064,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.65,104.27,0.0 +gfx942,80,6144,7168,9216,torch.float8_e4m3fnuz,ck,71,0,2131.2878,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.87,98.89,0.0 +gfx942,80,6144,7168,16384,torch.float8_e4m3fnuz,ck,85,0,3671.969,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,7168,18432,torch.float8_e4m3fnuz,ck,85,0,4139.4899,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,7168,256,torch.float8_e4m3fnuz,ck,71,0,158.8437,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1892.73,764.1,0.0 +gfx942,80,8192,7168,512,torch.float8_e4m3fnuz,ck,71,0,284.9791,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,211.0,439.7,0.0 +gfx942,80,8192,7168,1024,torch.float8_e4m3fnuz,ck,71,0,434.723,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,276.63,306.33,0.0 +gfx942,80,8192,7168,2048,torch.float8_e4m3fnuz,ck,71,0,665.8308,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3612.3,223.63,0.0 +gfx942,80,8192,7168,2304,torch.float8_e4m3fnuz,ck,71,0,736.4592,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3674.11,207.52,0.0 +gfx942,80,8192,7168,4096,torch.float8_e4m3fnuz,ck,71,0,1320.5151,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.28,136.58,0.0 +gfx942,80,8192,7168,4608,torch.float8_e4m3fnuz,ck,71,0,1479.7426,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.72,127.2,0.0 +gfx942,80,8192,7168,8192,torch.float8_e4m3fnuz,ck,71,0,2510.9482,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,383.15,96.88,0.0 +gfx942,80,8192,7168,9216,torch.float8_e4m3fnuz,ck,71,0,2849.736,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.8,90.89,0.0 +gfx942,80,8192,7168,16384,torch.float8_e4m3fnuz,ck,85,0,4942.7227,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,7168,18432,torch.float8_e4m3fnuz,ck,85,0,5501.8021,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,7168,256,torch.float8_e4m3fnuz,ck,71,0,199.187,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,7168,512,torch.float8_e4m3fnuz,ck,71,0,352.3613,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,213.31,441.91,0.0 +gfx942,80,10240,7168,1024,torch.float8_e4m3fnuz,ck,71,0,534.3848,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,281.3,308.07,0.0 +gfx942,80,10240,7168,2048,torch.float8_e4m3fnuz,ck,71,0,843.0746,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,7168,2304,torch.float8_e4m3fnuz,ck,71,0,913.7132,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,7168,4096,torch.float8_e4m3fnuz,ck,71,0,1650.4804,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.32,132.15,0.0 +gfx942,80,10240,7168,4608,torch.float8_e4m3fnuz,ck,71,0,1837.7562,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,368.09,123.53,0.0 +gfx942,80,10240,7168,8192,torch.float8_e4m3fnuz,ck,85,0,3170.8672,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.26,91.27,0.0 +gfx942,80,10240,7168,9216,torch.float8_e4m3fnuz,ck,85,0,3555.6005,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.5,86.41,0.0 +gfx942,80,10240,7168,16384,torch.float8_e4m3fnuz,ck,85,0,6072.4257,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,7168,18432,torch.float8_e4m3fnuz,ck,85,0,6834.6666,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,7168,256,torch.float8_e4m3fnuz,ck,71,0,243.398,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,7168,512,torch.float8_e4m3fnuz,ck,71,0,420.7005,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.39,442.41,0.0 +gfx942,80,12288,7168,1024,torch.float8_e4m3fnuz,ck,71,0,642.9593,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,280.56,304.97,0.0 +gfx942,80,12288,7168,2048,torch.float8_e4m3fnuz,ck,71,0,1001.5776,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,7168,2304,torch.float8_e4m3fnuz,ck,72,0,1134.7441,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,12288,7168,4096,torch.float8_e4m3fnuz,ck,71,0,1986.0405,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,363.31,128.83,0.0 +gfx942,80,12288,7168,4608,torch.float8_e4m3fnuz,ck,102,0,2231.5819,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,363.75,119.11,0.0 +gfx942,80,12288,7168,8192,torch.float8_e4m3fnuz,ck,102,0,3769.9628,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,382.79,89.0,0.0 +gfx942,80,12288,7168,9216,torch.float8_e4m3fnuz,ck,102,0,4210.5442,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,385.58,84.42,0.0 +gfx942,80,12288,7168,16384,torch.float8_e4m3fnuz,ck,102,0,7272.8109,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,7168,18432,torch.float8_e4m3fnuz,ck,102,0,8054.046,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,7168,256,torch.float8_e4m3fnuz,ck,72,0,297.7928,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,14336,7168,512,torch.float8_e4m3fnuz,ck,71,0,486.4844,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,216.3,445.09,0.0 +gfx942,80,14336,7168,1024,torch.float8_e4m3fnuz,ck,71,0,743.1281,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,283.2,306.19,0.0 +gfx942,80,14336,7168,2048,torch.float8_e4m3fnuz,ck,74,0,1199.9511,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,14336,7168,2304,torch.float8_e4m3fnuz,ck,71,0,1310.3925,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,7168,4096,torch.float8_e4m3fnuz,ck,71,0,2319.2645,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,362.97,126.59,0.0 +gfx942,80,14336,7168,4608,torch.float8_e4m3fnuz,ck,71,0,2586.1715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,366.19,117.78,0.0 +gfx942,80,14336,7168,8192,torch.float8_e4m3fnuz,ck,85,0,4407.5338,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.99,86.6,0.0 +gfx942,80,14336,7168,9216,torch.float8_e4m3fnuz,ck,85,0,4962.8544,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.65,81.34,0.0 +gfx942,80,14336,7168,16384,torch.float8_e4m3fnuz,ck,85,0,8455.8581,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,7168,18432,torch.float8_e4m3fnuz,ck,85,0,9585.3754,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,7168,256,torch.float8_e4m3fnuz,ck,71,0,306.2344,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1963.51,786.69,0.0 +gfx942,80,16384,7168,512,torch.float8_e4m3fnuz,ck,71,0,558.1976,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.44,442.39,0.0 +gfx942,80,16384,7168,1024,torch.float8_e4m3fnuz,ck,71,0,852.9241,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,281.99,303.66,0.0 +gfx942,80,16384,7168,2048,torch.float8_e4m3fnuz,ck,71,0,1322.9659,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3636.04,214.0,0.0 +gfx942,80,16384,7168,2304,torch.float8_e4m3fnuz,ck,71,0,1464.8949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3694.23,197.38,0.0 +gfx942,80,16384,7168,4096,torch.float8_e4m3fnuz,ck,71,0,2629.6553,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.86,126.01,0.0 +gfx942,80,16384,7168,4608,torch.float8_e4m3fnuz,ck,71,0,2948.8372,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,367.04,116.46,0.0 +gfx942,80,16384,7168,8192,torch.float8_e4m3fnuz,ck,71,0,5047.6593,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.2,84.76,0.0 +gfx942,80,16384,7168,9216,torch.float8_e4m3fnuz,ck,85,0,5684.8825,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.78,79.5,0.0 +gfx942,80,16384,7168,16384,torch.float8_e4m3fnuz,ck,85,0,9679.208,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,7168,18432,torch.float8_e4m3fnuz,ck,85,0,10922.6859,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,20480,7168,256,torch.float8_e4m3fnuz,ck,71,0,381.5807,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1969.75,787.98,0.0 +gfx942,80,20480,7168,2048,torch.float8_e4m3fnuz,ck,71,0,1639.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3666.64,213.56,0.0 +gfx942,80,20480,7168,2304,torch.float8_e4m3fnuz,ck,71,0,1805.4245,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3746.8,197.9,0.0 +gfx942,80,32768,7168,256,torch.float8_e4m3fnuz,ck,71,0,600.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2001.29,798.77,0.0 +gfx942,80,32768,7168,512,torch.float8_e4m3fnuz,ck,71,0,1096.744,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.3,446.97,0.0 +gfx942,80,32768,7168,1024,torch.float8_e4m3fnuz,ck,71,0,1682.4154,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.92,303.53,0.0 +gfx942,80,32768,7168,2048,torch.float8_e4m3fnuz,ck,72,0,2704.1764,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,32768,7168,2304,torch.float8_e4m3fnuz,ck,71,0,2907.7206,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3722.27,193.2,0.0 +gfx942,80,32768,7168,4096,torch.float8_e4m3fnuz,ck,71,0,5253.8807,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,366.23,120.55,0.0 +gfx942,80,32768,7168,4608,torch.float8_e4m3fnuz,ck,71,0,5865.1077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.07,111.47,0.0 +gfx942,80,32768,7168,8192,torch.float8_e4m3fnuz,ck,71,0,10111.3215,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,380.59,78.81,0.0 +gfx942,80,32768,7168,9216,torch.float8_e4m3fnuz,ck,85,0,11337.9808,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.84,73.89,0.0 +gfx942,80,32768,7168,16384,torch.float8_e4m3fnuz,ck,85,0,19294.1288,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,32768,7168,18432,torch.float8_e4m3fnuz,ck,85,0,21856.5731,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,7168,256,torch.float8_e4m3fnuz,ck,71,0,1212.7558,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,7168,512,torch.float8_e4m3fnuz,ck,71,0,2167.8734,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,221.89,450.56,0.0 +gfx942,80,65536,7168,1024,torch.float8_e4m3fnuz,ck,71,0,3378.6011,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,284.75,300.12,0.0 +gfx942,80,65536,7168,2048,torch.float8_e4m3fnuz,ck,71,0,5287.9696,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,7168,2304,torch.float8_e4m3fnuz,ck,71,0,5871.5636,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,7168,4096,torch.float8_e4m3fnuz,ck,71,0,10540.0481,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.11,117.39,0.0 +gfx942,80,65536,7168,4608,torch.float8_e4m3fnuz,ck,71,0,11720.6642,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.38,108.74,0.0 +gfx942,80,65536,7168,8192,torch.float8_e4m3fnuz,ck,85,0,20110.6143,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,382.71,76.33,0.0 +gfx942,80,65536,7168,9216,torch.float8_e4m3fnuz,ck,85,0,22625.9524,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,382.69,71.14,0.0 +gfx942,80,65536,7168,16384,torch.float8_e4m3fnuz,ck,85,0,38748.1812,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,7168,18432,torch.float8_e4m3fnuz,ck,85,0,43467.0784,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,7168,256,torch.float8_e4m3fnuz,ck,72,0,1997.5301,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,98304,7168,512,torch.float8_e4m3fnuz,ck,71,0,3241.0898,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.63,451.48,0.0 +gfx942,80,98304,7168,1024,torch.float8_e4m3fnuz,ck,71,0,5048.0903,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.87,300.57,0.0 +gfx942,80,98304,7168,2048,torch.float8_e4m3fnuz,ck,72,0,8086.5215,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,98304,7168,2304,torch.float8_e4m3fnuz,ck,71,0,8879.3992,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,7168,4096,torch.float8_e4m3fnuz,ck,71,0,15801.8859,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,365.3,116.52,0.0 +gfx942,80,98304,7168,4608,torch.float8_e4m3fnuz,ck,71,0,17582.8246,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,369.34,107.79,0.0 +gfx942,80,98304,7168,8192,torch.float8_e4m3fnuz,ck,102,0,29902.9106,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,386.08,76.02,0.0 +gfx942,80,98304,7168,9216,torch.float8_e4m3fnuz,ck,102,0,33489.9388,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,387.82,71.11,0.0 +gfx942,80,98304,7168,16384,torch.float8_e4m3fnuz,ck,102,0,57025.1637,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,7168,18432,torch.float8_e4m3fnuz,ck,102,0,63990.1906,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,7168,256,torch.float8_e4m3fnuz,ck,71,0,2444.3347,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,7168,2048,torch.float8_e4m3fnuz,ck,71,0,10519.7293,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,7168,2304,torch.float8_e4m3fnuz,ck,71,0,11621.1373,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,7168,8192,torch.float8_e4m3fnuz,ck,85,0,40106.8483,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,383.8,75.09,0.0 +gfx942,80,131072,7168,16384,torch.float8_e4m3fnuz,ck,85,0,77358.0635,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,7168,18432,torch.float8_e4m3fnuz,ck,85,0,87126.4572,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.7689,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.45,729.98,0.0 +gfx942,80,1,8192,1024,torch.float8_e4m3fnuz,ck,15,0,6.4482,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,26.02,1303.62,0.0 +gfx942,80,1,8192,1536,torch.float8_e4m3fnuz,ck,11,0,7.6538,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.8077,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.89,728.02,0.0 +gfx942,80,2,8192,1536,torch.float8_e4m3fnuz,ck,11,0,7.5144,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,6.7,1679.28,0.0 +gfx942,80,4,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.8024,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.78,734.5,0.0 +gfx942,80,4,8192,1536,torch.float8_e4m3fnuz,ck,6,0,7.6354,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,13.18,1657.36,0.0 +gfx942,80,8,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.8225,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,11.53,743.58,0.0 +gfx942,80,8,8192,1536,torch.float8_e4m3fnuz,ck,108,0,7.9074,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,25.46,1609.41,0.0 +gfx942,80,16,8192,512,torch.float8_e4m3fnuz,ck,9,0,5.7209,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,23.46,780.41,0.0 +gfx942,80,16,8192,1536,torch.float8_e4m3fnuz,ck,108,0,7.8973,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,50.99,1629.62,0.0 +gfx942,80,32,8192,512,torch.float8_e4m3fnuz,ck,76,0,6.5249,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,41.14,725.68,0.0 +gfx942,80,32,8192,1024,torch.float8_e4m3fnuz,ck,119,0,7.4863,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,717.14,1194.94,0.0 +gfx942,80,32,8192,1536,torch.float8_e4m3fnuz,ck,112,0,9.686,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,83.14,1358.29,0.0 +gfx942,80,64,8192,512,torch.float8_e4m3fnuz,ck,76,0,8.7017,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,61.7,606.28,0.0 +gfx942,80,64,8192,1024,torch.float8_e4m3fnuz,ck,114,0,10.9842,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,977.53,865.13,0.0 +gfx942,80,64,8192,1536,torch.float8_e4m3fnuz,ck,114,0,12.7978,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,96,8192,512,torch.float8_e4m3fnuz,ck,76,0,9.814,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,82.06,592.66,0.0 +gfx942,80,96,8192,1536,torch.float8_e4m3fnuz,ck,120,0,16.253,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,128,8192,512,torch.float8_e4m3fnuz,ck,76,0,11.8488,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,90.62,536.51,0.0 +gfx942,80,128,8192,1024,torch.float8_e4m3fnuz,ck,121,0,14.9703,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1434.5,709.19,0.0 +gfx942,80,128,8192,1536,torch.float8_e4m3fnuz,ck,121,0,19.1559,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,160,8192,512,torch.float8_e4m3fnuz,ck,84,0,12.6363,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,106.22,545.86,0.0 +gfx942,80,160,8192,1536,torch.float8_e4m3fnuz,ck,100,0,21.719,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,8192,512,torch.float8_e4m3fnuz,ck,84,0,14.7407,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,109.26,504.61,0.0 +gfx942,80,192,8192,1024,torch.float8_e4m3fnuz,ck,85,0,18.7963,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1713.76,624.11,0.0 +gfx942,80,192,8192,1536,torch.float8_e4m3fnuz,ck,86,0,24.7435,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,224,8192,512,torch.float8_e4m3fnuz,ck,76,0,16.6705,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,112.72,478.63,0.0 +gfx942,80,224,8192,1536,torch.float8_e4m3fnuz,ck,100,0,28.8459,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,256,8192,512,torch.float8_e4m3fnuz,ck,76,0,18.1586,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,118.26,469.18,0.0 +gfx942,80,256,8192,1024,torch.float8_e4m3fnuz,ck,70,0,23.9267,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1795.05,536.85,0.0 +gfx942,80,256,8192,1536,torch.float8_e4m3fnuz,ck,85,0,30.3059,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,288,8192,512,torch.float8_e4m3fnuz,ck,85,0,19.4782,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,124.03,465.15,0.0 +gfx942,80,288,8192,1536,torch.float8_e4m3fnuz,ck,85,0,32.0963,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,8192,512,torch.float8_e4m3fnuz,ck,85,0,19.6127,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,136.87,489.53,0.0 +gfx942,80,320,8192,1024,torch.float8_e4m3fnuz,ck,85,0,24.7583,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2168.45,563.82,0.0 +gfx942,80,320,8192,1536,torch.float8_e4m3fnuz,ck,101,0,32.6299,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,352,8192,512,torch.float8_e4m3fnuz,ck,84,0,22.5512,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,130.94,449.72,0.0 +gfx942,80,352,8192,1536,torch.float8_e4m3fnuz,ck,85,0,38.7583,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,8192,512,torch.float8_e4m3fnuz,ck,84,0,23.7469,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,135.65,449.84,0.0 +gfx942,80,384,8192,1536,torch.float8_e4m3fnuz,ck,85,0,38.4623,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,8192,512,torch.float8_e4m3fnuz,ck,84,0,25.8591,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,134.95,434.01,0.0 +gfx942,80,416,8192,1536,torch.float8_e4m3fnuz,ck,72,0,43.8675,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,448,8192,512,torch.float8_e4m3fnuz,ck,72,0,26.0849,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,144.07,450.98,0.0 +gfx942,80,448,8192,1536,torch.float8_e4m3fnuz,ck,72,0,43.8247,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,480,8192,512,torch.float8_e4m3fnuz,ck,102,0,28.9604,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,139.04,424.87,0.0 +gfx942,80,480,8192,1536,torch.float8_e4m3fnuz,ck,102,0,43.0079,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,8192,512,torch.float8_e4m3fnuz,ck,85,0,29.9571,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,143.37,428.78,0.0 +gfx942,80,512,8192,1024,torch.float8_e4m3fnuz,ck,85,0,37.882,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2267.55,456.72,0.0 +gfx942,80,512,8192,1536,torch.float8_e4m3fnuz,ck,85,0,50.0471,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,8192,512,torch.float8_e4m3fnuz,ck,85,0,50.608,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,169.73,424.75,0.0 +gfx942,80,1024,8192,1024,torch.float8_e4m3fnuz,ck,71,0,67.0889,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2560.76,390.74,0.0 +gfx942,80,1024,8192,1536,torch.float8_e4m3fnuz,ck,71,0,90.3089,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,2048,8192,512,torch.float8_e4m3fnuz,ck,71,0,91.3476,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,188.07,424.72,0.0 +gfx942,80,2048,8192,1024,torch.float8_e4m3fnuz,ck,71,0,120.4715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2852.11,365.57,0.0 +gfx942,80,2048,8192,1536,torch.float8_e4m3fnuz,ck,71,0,160.9323,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,8192,512,torch.float8_e4m3fnuz,ck,71,0,171.3964,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,200.47,428.25,0.0 +gfx942,80,4096,8192,1024,torch.float8_e4m3fnuz,ck,71,0,227.2969,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3023.34,350.61,0.0 +gfx942,80,4096,8192,1536,torch.float8_e4m3fnuz,ck,71,0,307.3715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,8192,512,torch.float8_e4m3fnuz,ck,71,0,246.4712,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,209.11,438.2,0.0 +gfx942,80,6144,8192,1536,torch.float8_e4m3fnuz,ck,71,0,455.0275,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,8192,512,torch.float8_e4m3fnuz,ck,71,0,326.6613,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,210.37,436.56,0.0 +gfx942,80,8192,8192,1024,torch.float8_e4m3fnuz,ck,71,0,434.801,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3160.96,347.27,0.0 +gfx942,80,8192,8192,1536,torch.float8_e4m3fnuz,ck,72,0,621.3308,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,10240,8192,512,torch.float8_e4m3fnuz,ck,71,0,399.9462,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,214.78,443.08,0.0 +gfx942,80,10240,8192,1536,torch.float8_e4m3fnuz,ck,71,0,754.6592,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,8192,512,torch.float8_e4m3fnuz,ck,71,0,479.0689,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.17,442.13,0.0 +gfx942,80,12288,8192,1536,torch.float8_e4m3fnuz,ck,102,0,912.1361,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,8192,512,torch.float8_e4m3fnuz,ck,71,0,553.8975,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,217.11,444.88,0.0 +gfx942,80,14336,8192,1536,torch.float8_e4m3fnuz,ck,71,0,1039.5469,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,8192,512,torch.float8_e4m3fnuz,ck,71,0,634.9591,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,216.45,442.58,0.0 +gfx942,80,16384,8192,1024,torch.float8_e4m3fnuz,ck,71,0,849.5169,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3235.7,345.61,0.0 +gfx942,80,16384,8192,1536,torch.float8_e4m3fnuz,ck,71,0,1182.4869,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,32768,8192,512,torch.float8_e4m3fnuz,ck,71,0,1248.4259,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,220.18,446.84,0.0 +gfx942,80,32768,8192,1536,torch.float8_e4m3fnuz,ck,71,0,2344.2068,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,8192,512,torch.float8_e4m3fnuz,ck,71,0,2486.4287,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,221.1,447.02,0.0 +gfx942,80,65536,8192,1536,torch.float8_e4m3fnuz,ck,71,0,4684.7126,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,8192,512,torch.float8_e4m3fnuz,ck,71,0,3733.0179,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,220.9,446.06,0.0 +gfx942,80,98304,8192,1536,torch.float8_e4m3fnuz,ck,71,0,7034.6763,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,8192,1536,torch.float8_e4m3fnuz,ck,71,0,9394.7156,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,9216,4096,torch.float8_e4m3fnuz,ck,11,0,13.8467,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,5.45,2727.82,0.0 +gfx942,80,1,9216,7168,torch.float8_e4m3fnuz,ck,5,0,20.4394,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,6.46,3233.26,0.0 +gfx942,80,2,9216,4096,torch.float8_e4m3fnuz,ck,11,0,13.2579,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,11.39,2850.66,0.0 +gfx942,80,2,9216,7168,torch.float8_e4m3fnuz,ck,5,0,20.6094,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,12.82,3207.83,0.0 +gfx942,80,4,9216,4096,torch.float8_e4m3fnuz,ck,5,0,13.3531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,22.62,2833.71,0.0 +gfx942,80,4,9216,7168,torch.float8_e4m3fnuz,ck,6,0,20.7042,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,25.53,3195.62,0.0 +gfx942,80,8,9216,4096,torch.float8_e4m3fnuz,ck,11,0,14.0931,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,42.86,2691.31,0.0 +gfx942,80,8,9216,7168,torch.float8_e4m3fnuz,ck,6,0,20.7785,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,50.87,3189.12,0.0 +gfx942,80,16,9216,4096,torch.float8_e4m3fnuz,ck,5,0,14.7532,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,81.88,2583.11,0.0 +gfx942,80,16,9216,7168,torch.float8_e4m3fnuz,ck,6,0,21.4925,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,98.36,3092.7,0.0 +gfx942,80,32,9216,4096,torch.float8_e4m3fnuz,ck,119,0,18.4783,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1307.44,2081.88,0.0 +gfx942,80,32,9216,7168,torch.float8_e4m3fnuz,ck,119,0,26.6078,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,158.9,2513.53,0.0 +gfx942,80,64,9216,4096,torch.float8_e4m3fnuz,ck,114,0,25.0703,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1927.32,1563.23,0.0 +gfx942,80,64,9216,7168,torch.float8_e4m3fnuz,ck,121,0,39.8294,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,212.3,1699.72,0.0 +gfx942,80,96,9216,7168,torch.float8_e4m3fnuz,ck,123,0,50.9032,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,249.17,1346.04,0.0 +gfx942,80,128,9216,4096,torch.float8_e4m3fnuz,ck,121,0,39.1328,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2469.46,1038.32,0.0 +gfx942,80,128,9216,7168,torch.float8_e4m3fnuz,ck,121,0,64.3726,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,262.71,1077.12,0.0 +gfx942,80,160,9216,7168,torch.float8_e4m3fnuz,ck,136,0,75.0573,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,281.64,934.7,0.0 +gfx942,80,192,9216,7168,torch.float8_e4m3fnuz,ck,86,0,85.7848,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,295.71,827.37,0.0 +gfx942,80,224,9216,7168,torch.float8_e4m3fnuz,ck,138,0,101.9633,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,290.25,704.12,0.0 +gfx942,80,256,9216,4096,torch.float8_e4m3fnuz,ck,70,0,64.6741,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2988.42,672.85,0.0 +gfx942,80,256,9216,7168,torch.float8_e4m3fnuz,ck,0,0,109.2884,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,309.48,664.42,0.0 +gfx942,80,288,9216,7168,torch.float8_e4m3fnuz,ck,94,0,120.3302,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.22,610.26,0.0 +gfx942,80,320,9216,7168,torch.float8_e4m3fnuz,ck,93,0,125.6795,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,336.4,590.81,0.0 +gfx942,80,352,9216,7168,torch.float8_e4m3fnuz,ck,102,0,159.5041,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,291.57,470.66,0.0 +gfx942,80,384,9216,7168,torch.float8_e4m3fnuz,ck,102,0,155.4479,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.37,488.21,0.0 +gfx942,80,416,9216,7168,torch.float8_e4m3fnuz,ck,85,0,192.6714,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,285.26,398.14,0.0 +gfx942,80,448,9216,7168,torch.float8_e4m3fnuz,ck,85,0,189.2582,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,312.75,409.65,0.0 +gfx942,80,480,9216,7168,torch.float8_e4m3fnuz,ck,94,0,183.3908,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,345.81,427.22,0.0 +gfx942,80,512,9216,7168,torch.float8_e4m3fnuz,ck,93,0,197.136,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,343.14,401.59,0.0 +gfx942,80,1024,9216,4096,torch.float8_e4m3fnuz,ck,93,0,219.0276,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3529.67,277.67,0.0 +gfx942,80,1024,9216,7168,torch.float8_e4m3fnuz,ck,93,0,383.3589,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,352.91,240.7,0.0 +gfx942,80,2048,9216,4096,torch.float8_e4m3fnuz,ck,68,0,425.2314,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3636.11,197.27,0.0 +gfx942,80,2048,9216,7168,torch.float8_e4m3fnuz,ck,85,0,734.0056,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,368.64,161.43,0.0 +gfx942,80,4096,9216,4096,torch.float8_e4m3fnuz,ck,71,0,812.5832,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3805.61,160.01,0.0 +gfx942,80,4096,9216,7168,torch.float8_e4m3fnuz,ck,71,0,1429.0113,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.7,119.61,0.0 +gfx942,80,4240,9216,4096,torch.float8_e4m3fnuz,ck,93,0,903.6056,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3542.58,147.48,0.0 +gfx942,80,6144,9216,7168,torch.float8_e4m3fnuz,ck,102,0,2153.3558,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,376.97,103.72,0.0 +gfx942,80,8192,9216,7168,torch.float8_e4m3fnuz,ck,71,0,2870.3304,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.08,96.08,0.0 +gfx942,80,10240,9216,7168,torch.float8_e4m3fnuz,ck,93,0,3582.8927,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.6,91.6,0.0 +gfx942,80,12288,9216,7168,torch.float8_e4m3fnuz,ck,102,0,4297.9231,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.74,88.56,0.0 +gfx942,80,14336,9216,7168,torch.float8_e4m3fnuz,ck,93,0,5014.6828,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,377.71,86.36,0.0 +gfx942,80,16384,9216,4096,torch.float8_e4m3fnuz,ck,71,0,3218.0651,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3843.77,126.43,0.0 +gfx942,80,16384,9216,7168,torch.float8_e4m3fnuz,ck,93,0,5708.7709,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.18,85.04,0.0 +gfx942,80,32768,9216,4096,torch.float8_e4m3fnuz,ck,68,0,6437.3643,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3843.03,120.54,0.0 +gfx942,80,32768,9216,7168,torch.float8_e4m3fnuz,ck,85,0,11449.253,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.13,79.04,0.0 +gfx942,80,65536,9216,7168,torch.float8_e4m3fnuz,ck,85,0,22829.8274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.27,76.38,0.0 +gfx942,80,98304,9216,7168,torch.float8_e4m3fnuz,ck,102,0,34028.5473,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.68,75.9,0.0 +gfx942,80,1,11264,1536,torch.float8_e4m3fnuz,ck,108,0,9.3388,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,37.05,1855.22,0.0 +gfx942,80,2,11264,1536,torch.float8_e4m3fnuz,ck,15,0,9.4898,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,11264,1536,torch.float8_e4m3fnuz,ck,10,0,9.667,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,11264,1536,torch.float8_e4m3fnuz,ck,10,0,10.0366,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,11264,1536,torch.float8_e4m3fnuz,ck,10,0,9.7876,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,565.66,1807.03,0.0 +gfx942,80,32,11264,1536,torch.float8_e4m3fnuz,ck,76,0,12.5451,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,882.65,1440.53,0.0 +gfx942,80,48,11264,1536,torch.float8_e4m3fnuz,ck,113,0,15.2534,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1088.9,1210.0,0.0 +gfx942,80,64,11264,1536,torch.float8_e4m3fnuz,ck,112,0,17.4734,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1267.41,1078.3,0.0 +gfx942,80,80,11264,1536,torch.float8_e4m3fnuz,ck,100,0,21.6601,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1278.04,887.65,0.0 +gfx942,80,96,11264,1536,torch.float8_e4m3fnuz,ck,119,0,21.7177,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1529.58,903.03,0.0 +gfx942,80,112,11264,1536,torch.float8_e4m3fnuz,ck,85,0,25.6868,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1508.77,778.48,0.0 +gfx942,80,128,11264,1536,torch.float8_e4m3fnuz,ck,85,0,26.4316,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1675.72,771.11,0.0 +gfx942,80,160,11264,1536,torch.float8_e4m3fnuz,ck,100,0,30.1239,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,11264,1536,torch.float8_e4m3fnuz,ck,85,0,31.3634,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2118.32,698.96,0.0 +gfx942,80,224,11264,1536,torch.float8_e4m3fnuz,ck,100,0,37.6656,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,256,11264,1536,torch.float8_e4m3fnuz,ck,85,0,41.1,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2155.32,570.85,0.0 +gfx942,80,288,11264,1536,torch.float8_e4m3fnuz,ck,102,0,42.7952,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,11264,1536,torch.float8_e4m3fnuz,ck,85,0,45.0808,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,352,11264,1536,torch.float8_e4m3fnuz,ck,85,0,51.422,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,11264,1536,torch.float8_e4m3fnuz,ck,85,0,51.3588,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,11264,1536,torch.float8_e4m3fnuz,ck,85,0,58.2264,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,11264,1536,torch.float8_e4m3fnuz,ck,85,0,57.7888,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,11264,1536,torch.float8_e4m3fnuz,ck,102,0,64.1117,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,11264,1536,torch.float8_e4m3fnuz,ck,85,0,63.9678,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2769.63,463.08,0.0 +gfx942,80,1024,11264,1536,torch.float8_e4m3fnuz,ck,85,0,116.8538,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3032.29,358.94,0.0 +gfx942,80,1536,11264,1536,torch.float8_e4m3fnuz,ck,85,0,168.9501,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3145.91,321.18,0.0 +gfx942,80,2048,11264,1536,torch.float8_e4m3fnuz,ck,71,0,217.6677,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3255.74,305.9,0.0 +gfx942,80,4096,11264,1536,torch.float8_e4m3fnuz,ck,71,0,417.0421,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3398.55,277.83,0.0 +gfx942,80,6144,11264,1536,torch.float8_e4m3fnuz,ck,71,0,614.8496,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,11264,1536,torch.float8_e4m3fnuz,ck,71,0,818.2792,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3464.19,262.05,0.0 +gfx942,80,10240,11264,1536,torch.float8_e4m3fnuz,ck,71,0,1039.4493,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,11264,1536,torch.float8_e4m3fnuz,ck,71,0,1221.6133,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,11264,1536,torch.float8_e4m3fnuz,ck,71,0,1422.2274,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,11264,1536,torch.float8_e4m3fnuz,ck,72,0,1650.9527,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3433.99,249.29,0.0 +gfx942,80,32768,11264,1536,torch.float8_e4m3fnuz,ck,72,0,3296.9348,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3439.17,244.42,0.0 +gfx942,80,49152,11264,1536,torch.float8_e4m3fnuz,ck,71,0,4796.016,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3546.29,250.23,0.0 +gfx942,80,65536,11264,1536,torch.float8_e4m3fnuz,ck,71,0,6394.7949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3546.23,249.32,0.0 +gfx942,80,73728,11264,1536,torch.float8_e4m3fnuz,ck,71,0,7197.2388,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3544.71,248.91,0.0 +gfx942,80,98304,11264,1536,torch.float8_e4m3fnuz,ck,71,0,9673.3902,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,11264,1536,torch.float8_e4m3fnuz,ck,71,0,12851.8716,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3529.05,246.77,0.0 +gfx942,80,1,12288,1536,torch.float8_e4m3fnuz,ck,15,0,8.8721,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,4.25,2130.33,0.0 +gfx942,80,2,12288,1536,torch.float8_e4m3fnuz,ck,15,0,9.1519,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,8.25,2068.05,0.0 +gfx942,80,4,12288,1536,torch.float8_e4m3fnuz,ck,108,0,9.0818,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,16.63,2089.76,0.0 +gfx942,80,8,12288,1536,torch.float8_e4m3fnuz,ck,108,0,9.4892,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,31.82,2011.05,0.0 +gfx942,80,16,12288,1536,torch.float8_e4m3fnuz,ck,10,0,10.1361,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,59.59,1903.31,0.0 +gfx942,80,32,12288,1536,torch.float8_e4m3fnuz,ck,112,0,12.5215,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,96.47,1574.09,0.0 +gfx942,80,64,12288,1536,torch.float8_e4m3fnuz,ck,112,0,17.7365,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,136.21,1158.38,0.0 +gfx942,80,96,12288,1536,torch.float8_e4m3fnuz,ck,133,0,22.9216,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,158.1,932.79,0.0 +gfx942,80,128,12288,1536,torch.float8_e4m3fnuz,ck,119,0,27.07,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,178.49,820.71,0.0 +gfx942,80,160,12288,1536,torch.float8_e4m3fnuz,ck,119,0,31.5739,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,191.29,730.11,0.0 +gfx942,80,192,12288,1536,torch.float8_e4m3fnuz,ck,85,0,34.1912,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,211.98,698.66,0.0 +gfx942,80,224,12288,1536,torch.float8_e4m3fnuz,ck,85,0,42.2156,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,200.3,585.65,0.0 +gfx942,80,256,12288,1536,torch.float8_e4m3fnuz,ck,85,0,43.4262,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.53,588.56,0.0 +gfx942,80,288,12288,1536,torch.float8_e4m3fnuz,ck,102,0,48.4492,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,224.39,544.79,0.0 +gfx942,80,320,12288,1536,torch.float8_e4m3fnuz,ck,93,0,51.1407,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,236.2,532.46,0.0 +gfx942,80,352,12288,1536,torch.float8_e4m3fnuz,ck,93,0,60.7878,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.59,461.7,0.0 +gfx942,80,384,12288,1536,torch.float8_e4m3fnuz,ck,71,0,59.7854,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,242.46,483.42,0.0 +gfx942,80,416,12288,1536,torch.float8_e4m3fnuz,ck,85,0,69.8237,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,224.9,425.89,0.0 +gfx942,80,448,12288,1536,torch.float8_e4m3fnuz,ck,85,0,69.2979,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.04,441.18,0.0 +gfx942,80,480,12288,1536,torch.float8_e4m3fnuz,ck,94,0,71.9873,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,251.7,436.3,0.0 +gfx942,80,512,12288,1536,torch.float8_e4m3fnuz,ck,85,0,75.8898,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,254.68,424.88,0.0 +gfx942,80,1024,12288,1536,torch.float8_e4m3fnuz,ck,71,0,138.0326,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,280.04,330.45,0.0 +gfx942,80,2048,12288,1536,torch.float8_e4m3fnuz,ck,71,0,263.7876,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,293.07,274.28,0.0 +gfx942,80,4096,12288,1536,torch.float8_e4m3fnuz,ck,71,0,499.9388,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,309.28,251.69,0.0 +gfx942,80,6144,12288,1536,torch.float8_e4m3fnuz,ck,71,0,739.7506,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,313.52,242.39,0.0 +gfx942,80,8192,12288,1536,torch.float8_e4m3fnuz,ck,71,0,970.833,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.53,239.78,0.0 +gfx942,80,10240,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1218.8252,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,317.15,234.87,0.0 +gfx942,80,12288,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1456.9262,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.38,233.19,0.0 +gfx942,80,14336,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1687.663,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.66,232.99,0.0 +gfx942,80,16384,12288,1536,torch.float8_e4m3fnuz,ck,71,0,1931.9974,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.12,231.21,0.0 +gfx942,80,32768,12288,1536,torch.float8_e4m3fnuz,ck,71,0,3848.4946,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.41,227.23,0.0 +gfx942,80,65536,12288,1536,torch.float8_e4m3fnuz,ck,71,0,7676.2381,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.28,225.39,0.0 +gfx942,80,98304,12288,1536,torch.float8_e4m3fnuz,ck,71,0,11500.3768,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.67,224.84,0.0 +gfx942,80,131072,12288,1536,torch.float8_e4m3fnuz,ck,71,0,15314.1682,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.09,224.72,0.0 +gfx942,80,1,14336,1536,torch.float8_e4m3fnuz,ck,108,0,9.3708,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,4.7,2353.09,0.0 +gfx942,80,2,14336,1536,torch.float8_e4m3fnuz,ck,108,0,9.4408,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,9.33,2338.84,0.0 +gfx942,80,4,14336,1536,torch.float8_e4m3fnuz,ck,15,0,9.4788,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,18.58,2335.84,0.0 +gfx942,80,8,14336,1536,torch.float8_e4m3fnuz,ck,108,0,9.4769,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,37.18,2349.06,0.0 +gfx942,80,16,14336,1536,torch.float8_e4m3fnuz,ck,15,0,10.5152,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,67.01,2140.09,0.0 +gfx942,80,32,14336,1536,torch.float8_e4m3fnuz,ck,112,0,12.7846,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,110.23,1798.0,0.0 +gfx942,80,64,14336,1536,torch.float8_e4m3fnuz,ck,119,0,18.1404,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,155.38,1320.45,0.0 +gfx942,80,96,14336,1536,torch.float8_e4m3fnuz,ck,120,0,25.9487,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,162.93,960.36,0.0 +gfx942,80,128,14336,1536,torch.float8_e4m3fnuz,ck,85,0,27.8617,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,202.33,929.11,0.0 +gfx942,80,160,14336,1536,torch.float8_e4m3fnuz,ck,119,0,36.4462,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,193.34,736.79,0.0 +gfx942,80,192,14336,1536,torch.float8_e4m3fnuz,ck,86,0,41.4702,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,203.9,670.84,0.0 +gfx942,80,224,14336,1536,torch.float8_e4m3fnuz,ck,72,0,47.9425,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,205.77,600.44,0.0 +gfx942,80,256,14336,1536,torch.float8_e4m3fnuz,ck,72,0,48.7805,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,231.12,609.94,0.0 +gfx942,80,288,14336,1536,torch.float8_e4m3fnuz,ck,85,0,56.6988,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,223.7,541.81,0.0 +gfx942,80,320,14336,1536,torch.float8_e4m3fnuz,ck,85,0,57.1533,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.58,554.42,0.0 +gfx942,80,352,14336,1536,torch.float8_e4m3fnuz,ck,85,0,69.3423,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,223.56,470.9,0.0 +gfx942,80,384,14336,1536,torch.float8_e4m3fnuz,ck,85,0,69.3265,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,243.94,484.95,0.0 +gfx942,80,416,14336,1536,torch.float8_e4m3fnuz,ck,85,0,77.2118,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,237.28,447.94,0.0 +gfx942,80,448,14336,1536,torch.float8_e4m3fnuz,ck,85,0,76.7227,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,257.16,463.4,0.0 +gfx942,80,480,14336,1536,torch.float8_e4m3fnuz,ck,86,0,86.6195,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.05,421.61,0.0 +gfx942,80,512,14336,1536,torch.float8_e4m3fnuz,ck,71,0,87.7988,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,256.82,426.96,0.0 +gfx942,80,1024,14336,1536,torch.float8_e4m3fnuz,ck,85,0,161.5909,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.08,327.7,0.0 +gfx942,80,2048,14336,1536,torch.float8_e4m3fnuz,ck,71,0,300.1671,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,300.48,279.46,0.0 +gfx942,80,4096,14336,1536,torch.float8_e4m3fnuz,ck,71,0,574.396,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.05,253.75,0.0 +gfx942,80,6144,14336,1536,torch.float8_e4m3fnuz,ck,71,0,860.7529,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,314.36,241.21,0.0 +gfx942,80,8192,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1133.1069,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.4,237.83,0.0 +gfx942,80,10240,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1419.1055,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,317.79,233.49,0.0 +gfx942,80,12288,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1690.3277,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.15,232.63,0.0 +gfx942,80,14336,14336,1536,torch.float8_e4m3fnuz,ck,71,0,1974.9354,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,319.69,230.43,0.0 +gfx942,80,16384,14336,1536,torch.float8_e4m3fnuz,ck,71,0,2260.0129,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,319.27,228.74,0.0 +gfx942,80,32768,14336,1536,torch.float8_e4m3fnuz,ck,71,0,4486.7553,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.64,225.53,0.0 +gfx942,80,65536,14336,1536,torch.float8_e4m3fnuz,ck,71,0,8920.3186,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.56,224.4,0.0 +gfx942,80,98304,14336,1536,torch.float8_e4m3fnuz,ck,71,0,13408.709,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.87,223.11,0.0 +gfx942,80,1,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.8396,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.45,1231.34,0.0 +gfx942,80,2,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.6785,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.02,1266.03,0.0 +gfx942,80,4,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.9405,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.67,1227.83,0.0 +gfx942,80,8,16384,512,torch.float8_e4m3fnuz,ck,9,0,6.7881,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,19.77,1275.0,0.0 +gfx942,80,16,16384,512,torch.float8_e4m3fnuz,ck,9,0,7.1281,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,37.66,1251.54,0.0 +gfx942,80,32,16384,512,torch.float8_e4m3fnuz,ck,76,0,8.5341,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,62.91,1107.74,0.0 +gfx942,80,64,16384,512,torch.float8_e4m3fnuz,ck,76,0,11.9905,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,89.55,877.24,0.0 +gfx942,80,96,16384,512,torch.float8_e4m3fnuz,ck,84,0,15.4745,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,104.08,748.55,0.0 +gfx942,80,128,16384,512,torch.float8_e4m3fnuz,ck,76,0,18.6149,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,115.36,679.48,0.0 +gfx942,80,160,16384,512,torch.float8_e4m3fnuz,ck,84,0,21.486,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,124.94,638.25,0.0 +gfx942,80,192,16384,512,torch.float8_e4m3fnuz,ck,84,0,24.2394,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,132.89,609.68,0.0 +gfx942,80,224,16384,512,torch.float8_e4m3fnuz,ck,84,0,28.1282,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,133.61,563.25,0.0 +gfx942,80,256,16384,512,torch.float8_e4m3fnuz,ck,85,0,30.4715,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,140.95,554.89,0.0 +gfx942,80,288,16384,512,torch.float8_e4m3fnuz,ck,84,0,33.0464,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,146.21,543.88,0.0 +gfx942,80,320,16384,512,torch.float8_e4m3fnuz,ck,85,0,34.1726,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,157.11,557.12,0.0 +gfx942,80,352,16384,512,torch.float8_e4m3fnuz,ck,84,0,39.9477,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,147.83,503.24,0.0 +gfx942,80,384,16384,512,torch.float8_e4m3fnuz,ck,85,0,40.3333,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,159.73,524.83,0.0 +gfx942,80,416,16384,512,torch.float8_e4m3fnuz,ck,84,0,45.0184,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,155.03,493.87,0.0 +gfx942,80,448,16384,512,torch.float8_e4m3fnuz,ck,85,0,47.0283,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,159.82,495.4,0.0 +gfx942,80,480,16384,512,torch.float8_e4m3fnuz,ck,84,0,51.972,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,154.95,468.77,0.0 +gfx942,80,512,16384,512,torch.float8_e4m3fnuz,ck,85,0,51.7738,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,165.91,491.14,0.0 +gfx942,80,1024,16384,512,torch.float8_e4m3fnuz,ck,71,0,93.0854,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,184.56,456.22,0.0 +gfx942,80,2048,16384,512,torch.float8_e4m3fnuz,ck,71,0,172.6344,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,199.03,443.4,0.0 +gfx942,80,4096,16384,512,torch.float8_e4m3fnuz,ck,71,0,331.3938,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,207.37,436.65,0.0 +gfx942,80,6144,16384,512,torch.float8_e4m3fnuz,ck,71,0,477.6551,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.8,445.64,0.0 +gfx942,80,8192,16384,512,torch.float8_e4m3fnuz,ck,71,0,637.0735,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,215.73,441.11,0.0 +gfx942,80,10240,16384,512,torch.float8_e4m3fnuz,ck,71,0,784.9988,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.85,444.81,0.0 +gfx942,80,12288,16384,512,torch.float8_e4m3fnuz,ck,71,0,942.3456,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,218.77,442.87,0.0 +gfx942,80,14336,16384,512,torch.float8_e4m3fnuz,ck,71,0,1094.9387,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.66,443.4,0.0 +gfx942,80,16384,16384,512,torch.float8_e4m3fnuz,ck,71,0,1252.4903,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,219.47,442.04,0.0 +gfx942,80,32768,16384,512,torch.float8_e4m3fnuz,ck,71,0,2494.7808,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,220.36,440.48,0.0 +gfx942,80,65536,16384,512,torch.float8_e4m3fnuz,ck,71,0,4931.7756,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.94,443.94,0.0 +gfx942,80,98304,16384,512,torch.float8_e4m3fnuz,ck,71,0,7420.0505,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,222.27,442.04,0.0 +gfx942,80,131072,16384,512,torch.float8_e4m3fnuz,ck,9,0,18530.1837,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,118.67,235.86,0.0 +gfx942,80,1,18432,7168,torch.float8_e4m3fnuz,ck,118,0,37.4132,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,7.06,3532.57,0.0 +gfx942,80,2,18432,7168,torch.float8_e4m3fnuz,ck,16,0,37.7738,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,13.99,3500.01,0.0 +gfx942,80,4,18432,7168,torch.float8_e4m3fnuz,ck,109,0,38.1876,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,27.68,3464.39,0.0 +gfx942,80,8,18432,7168,torch.float8_e4m3fnuz,ck,109,0,39.4871,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,53.53,3354.84,0.0 +gfx942,80,16,18432,7168,torch.float8_e4m3fnuz,ck,7,0,40.5749,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,104.2,3273.58,0.0 +gfx942,80,32,18432,7168,torch.float8_e4m3fnuz,ck,133,0,46.9286,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,180.18,2845.38,0.0 +gfx942,80,64,18432,7168,torch.float8_e4m3fnuz,ck,121,0,68.3333,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,247.48,1974.71,0.0 +gfx942,80,96,18432,7168,torch.float8_e4m3fnuz,ck,86,0,88.3315,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,287.18,1543.59,0.0 +gfx942,80,128,18432,7168,torch.float8_e4m3fnuz,ck,0,0,110.5497,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,305.95,1246.11,0.0 +gfx942,80,160,18432,7168,torch.float8_e4m3fnuz,ck,156,0,141.951,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,297.84,980.38,0.0 +gfx942,80,192,18432,7168,torch.float8_e4m3fnuz,ck,102,0,155.4518,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,326.37,904.3,0.0 +gfx942,80,224,18432,7168,torch.float8_e4m3fnuz,ck,40,0,199.1076,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,297.28,713.1,0.0 +gfx942,80,256,18432,7168,torch.float8_e4m3fnuz,ck,93,0,200.0228,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,338.19,716.88,0.0 +gfx942,80,288,18432,7168,torch.float8_e4m3fnuz,ck,94,0,236.0343,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.42,613.48,0.0 +gfx942,80,320,18432,7168,torch.float8_e4m3fnuz,ck,93,0,243.2446,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.62,601.09,0.0 +gfx942,80,352,18432,7168,torch.float8_e4m3fnuz,ck,85,0,301.0235,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,308.99,490.39,0.0 +gfx942,80,384,18432,7168,torch.float8_e4m3fnuz,ck,94,0,294.9271,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,344.05,505.31,0.0 +gfx942,80,416,18432,7168,torch.float8_e4m3fnuz,ck,85,0,357.1532,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,307.78,421.21,0.0 +gfx942,80,448,18432,7168,torch.float8_e4m3fnuz,ck,93,0,351.2371,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,337.04,432.32,0.0 +gfx942,80,480,18432,7168,torch.float8_e4m3fnuz,ck,94,0,352.3586,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,359.96,434.94,0.0 +gfx942,80,512,18432,7168,torch.float8_e4m3fnuz,ck,68,0,388.5432,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,348.2,398.06,0.0 +gfx942,80,1024,18432,7168,torch.float8_e4m3fnuz,ck,85,0,742.4427,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,364.45,238.68,0.0 +gfx942,80,2048,18432,7168,torch.float8_e4m3fnuz,ck,71,0,1441.3783,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,375.45,154.23,0.0 +gfx942,80,4096,18432,7168,torch.float8_e4m3fnuz,ck,71,0,2836.9828,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.51,110.14,0.0 +gfx942,80,6144,18432,7168,torch.float8_e4m3fnuz,ck,102,0,4288.8669,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.54,93.88,0.0 +gfx942,80,8192,18432,7168,torch.float8_e4m3fnuz,ck,68,0,5709.0564,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.16,86.32,0.0 +gfx942,80,10240,18432,7168,torch.float8_e4m3fnuz,ck,68,0,7143.9568,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,378.76,81.61,0.0 +gfx942,80,12288,18432,7168,torch.float8_e4m3fnuz,ck,85,0,8555.9295,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.5,78.68,0.0 +gfx942,80,14336,18432,7168,torch.float8_e4m3fnuz,ck,85,0,9992.1813,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.11,76.4,0.0 +gfx942,80,16384,18432,7168,torch.float8_e4m3fnuz,ck,93,0,11413.8848,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.3,74.78,0.0 +gfx942,80,32768,18432,7168,torch.float8_e4m3fnuz,ck,85,0,22842.7493,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.05,68.95,0.0 +gfx942,80,65536,18432,7168,torch.float8_e4m3fnuz,ck,93,0,45613.8859,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,379.65,66.16,0.0 +gfx942,80,98304,18432,7168,torch.float8_e4m3fnuz,ck,102,0,68165.4052,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,381.07,65.44,0.0 +gfx942,80,1,20480,1536,torch.float8_e4m3fnuz,ck,16,0,11.0968,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,5.67,2838.64,0.0 +gfx942,80,2,20480,1536,torch.float8_e4m3fnuz,ck,118,0,11.2695,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.17,2798.91,0.0 +gfx942,80,4,20480,1536,torch.float8_e4m3fnuz,ck,16,0,11.25,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,22.37,2811.31,0.0 +gfx942,80,8,20480,1536,torch.float8_e4m3fnuz,ck,118,0,11.572,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,43.49,2747.77,0.0 +gfx942,80,16,20480,1536,torch.float8_e4m3fnuz,ck,109,0,12.6053,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,79.86,2549.5,0.0 +gfx942,80,32,20480,1536,torch.float8_e4m3fnuz,ck,112,0,16.0124,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,125.73,2049.48,0.0 +gfx942,80,64,20480,1536,torch.float8_e4m3fnuz,ck,121,0,21.6428,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,186.04,1579.14,0.0 +gfx942,80,96,20480,1536,torch.float8_e4m3fnuz,ck,86,0,28.314,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,213.31,1255.1,0.0 +gfx942,80,128,20480,1536,torch.float8_e4m3fnuz,ck,70,0,35.7174,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,225.47,1033.02,0.0 +gfx942,80,160,20480,1536,torch.float8_e4m3fnuz,ck,156,0,47.001,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,214.17,813.95,0.0 +gfx942,80,192,20480,1536,torch.float8_e4m3fnuz,ck,102,0,49.299,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,245.03,803.6,0.0 +gfx942,80,224,20480,1536,torch.float8_e4m3fnuz,ck,71,0,61.5138,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,229.1,666.13,0.0 +gfx942,80,256,20480,1536,torch.float8_e4m3fnuz,ck,85,0,63.3795,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,254.12,667.98,0.0 +gfx942,80,288,20480,1536,torch.float8_e4m3fnuz,ck,102,0,74.1212,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,244.46,589.52,0.0 +gfx942,80,320,20480,1536,torch.float8_e4m3fnuz,ck,85,0,81.5229,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.96,552.68,0.0 +gfx942,80,352,20480,1536,torch.float8_e4m3fnuz,ck,71,0,90.9179,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,243.58,510.53,0.0 +gfx942,80,384,20480,1536,torch.float8_e4m3fnuz,ck,85,0,90.8301,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,265.98,525.99,0.0 +gfx942,80,416,20480,1536,torch.float8_e4m3fnuz,ck,85,0,106.3525,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,246.09,462.01,0.0 +gfx942,80,448,20480,1536,torch.float8_e4m3fnuz,ck,85,0,107.2447,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,262.82,470.84,0.0 +gfx942,80,480,20480,1536,torch.float8_e4m3fnuz,ck,102,0,116.0509,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,260.22,446.83,0.0 +gfx942,80,512,20480,1536,torch.float8_e4m3fnuz,ck,71,0,115.4031,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,279.13,461.12,0.0 +gfx942,80,1024,20480,1536,torch.float8_e4m3fnuz,ck,71,0,217.3761,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,296.37,344.9,0.0 +gfx942,80,2048,20480,1536,torch.float8_e4m3fnuz,ck,71,0,416.8111,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,309.13,284.28,0.0 +gfx942,80,4096,20480,1536,torch.float8_e4m3fnuz,ck,71,0,813.4964,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,316.78,252.64,0.0 +gfx942,80,6144,20480,1536,torch.float8_e4m3fnuz,ck,71,0,1212.4912,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,318.8,241.28,0.0 +gfx942,80,8192,20480,1536,torch.float8_e4m3fnuz,ck,71,0,1603.169,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.49,236.77,0.0 +gfx942,80,10240,20480,1536,torch.float8_e4m3fnuz,ck,71,0,2006.1017,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.14,232.6,0.0 +gfx942,80,12288,20480,1536,torch.float8_e4m3fnuz,ck,71,0,2403.8234,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,321.61,230.32,0.0 +gfx942,80,14336,20480,1536,torch.float8_e4m3fnuz,ck,71,0,2816.6224,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,320.22,227.46,0.0 +gfx942,80,16384,20480,1536,torch.float8_e4m3fnuz,ck,71,0,3191.8526,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,322.94,227.99,0.0 +gfx942,80,32768,20480,1536,torch.float8_e4m3fnuz,ck,71,0,6375.7739,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.35,223.34,0.0 +gfx942,80,65536,20480,1536,torch.float8_e4m3fnuz,ck,71,0,12758.9582,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,323.16,220.74,0.0 +gfx942,80,98304,20480,1536,torch.float8_e4m3fnuz,ck,71,0,19087.7671,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,324.02,220.51,0.0 +gfx942,80,1,24576,1536,torch.float8_e4m3fnuz,ck,15,0,13.5093,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,24576,1536,torch.float8_e4m3fnuz,ck,108,0,13.6777,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,24576,1536,torch.float8_e4m3fnuz,ck,15,0,13.9333,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,24576,1536,torch.float8_e4m3fnuz,ck,9,0,14.5085,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,24576,1536,torch.float8_e4m3fnuz,ck,5,0,15.7929,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,32,24576,1536,torch.float8_e4m3fnuz,ck,112,0,19.3781,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1246.73,2031.71,0.0 +gfx942,80,64,24576,1536,torch.float8_e4m3fnuz,ck,93,0,27.4074,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1762.97,1495.68,0.0 +gfx942,80,96,24576,1536,torch.float8_e4m3fnuz,ck,94,0,34.225,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2117.68,1245.14,0.0 +gfx942,80,128,24576,1536,torch.float8_e4m3fnuz,ck,85,0,42.5762,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2269.74,1039.0,0.0 +gfx942,80,160,24576,1536,torch.float8_e4m3fnuz,ck,93,0,55.5159,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2175.88,826.05,0.0 +gfx942,80,192,24576,1536,torch.float8_e4m3fnuz,ck,93,0,55.8139,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2597.12,850.7,0.0 +gfx942,80,224,24576,1536,torch.float8_e4m3fnuz,ck,85,0,71.3416,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2370.49,688.28,0.0 +gfx942,80,256,24576,1536,torch.float8_e4m3fnuz,ck,85,0,72.5312,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2664.7,699.35,0.0 +gfx942,80,288,24576,1536,torch.float8_e4m3fnuz,ck,94,0,79.4368,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2737.18,658.98,0.0 +gfx942,80,320,24576,1536,torch.float8_e4m3fnuz,ck,93,0,84.1633,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2870.51,641.24,0.0 +gfx942,80,352,24576,1536,torch.float8_e4m3fnuz,ck,93,0,101.4118,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2620.51,548.17,0.0 +gfx942,80,384,24576,1536,torch.float8_e4m3fnuz,ck,93,0,99.8625,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2903.09,572.92,0.0 +gfx942,80,416,24576,1536,torch.float8_e4m3fnuz,ck,102,0,122.5387,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,24576,1536,torch.float8_e4m3fnuz,ck,93,0,117.3539,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,24576,1536,torch.float8_e4m3fnuz,ck,102,0,119.4687,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,24576,1536,torch.float8_e4m3fnuz,ck,93,0,126.7363,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,24576,1536,torch.float8_e4m3fnuz,ck,68,0,241.8128,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,2048,24576,1536,torch.float8_e4m3fnuz,ck,71,0,459.7917,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,24576,1536,torch.float8_e4m3fnuz,ck,72,0,920.9012,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,6144,24576,1536,torch.float8_e4m3fnuz,ck,71,0,1348.8178,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,24576,1536,torch.float8_e4m3fnuz,ck,71,0,1786.9965,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,24576,1536,torch.float8_e4m3fnuz,ck,71,0,2209.3759,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,24576,1536,torch.float8_e4m3fnuz,ck,71,0,2656.0688,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,24576,1536,torch.float8_e4m3fnuz,ck,71,0,3089.1315,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,24576,1536,torch.float8_e4m3fnuz,ck,71,0,3536.4999,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,32768,24576,1536,torch.float8_e4m3fnuz,ck,71,0,7017.8836,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,24576,1536,torch.float8_e4m3fnuz,ck,0,0,inf,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,24576,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,131072,24576,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,32768,512,torch.float8_e4m3fnuz,ck,9,0,9.7066,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,1,32768,1536,torch.float8_e4m3fnuz,ck,15,0,17.5611,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,32768,512,torch.float8_e4m3fnuz,ck,9,0,9.513,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,32768,1536,torch.float8_e4m3fnuz,ck,108,0,17.4059,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,4,32768,512,torch.float8_e4m3fnuz,ck,23,0,9.1702,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0.0,0.0,0.0 +gfx942,80,4,32768,1536,torch.float8_e4m3fnuz,ck,109,0,18.3707,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,32768,512,torch.float8_e4m3fnuz,ck,9,0,9.9258,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,8,32768,1536,torch.float8_e4m3fnuz,ck,5,0,18.2531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,32768,512,torch.float8_e4m3fnuz,ck,9,0,10.2342,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,32768,1536,torch.float8_e4m3fnuz,ck,5,0,19.2975,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,32,32768,512,torch.float8_e4m3fnuz,ck,76,0,12.0918,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,32,32768,1536,torch.float8_e4m3fnuz,ck,119,0,24.3999,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,64,32768,512,torch.float8_e4m3fnuz,ck,85,0,18.7478,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,64,32768,1536,torch.float8_e4m3fnuz,ck,101,0,34.2699,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,96,32768,512,torch.float8_e4m3fnuz,ck,84,0,23.8023,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,96,32768,1536,torch.float8_e4m3fnuz,ck,102,0,43.5312,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,128,32768,512,torch.float8_e4m3fnuz,ck,85,0,28.3083,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,128,32768,1536,torch.float8_e4m3fnuz,ck,85,0,54.7332,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,160,32768,512,torch.float8_e4m3fnuz,ck,84,0,34.3591,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,160,32768,1536,torch.float8_e4m3fnuz,ck,100,0,73.0357,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,192,32768,512,torch.float8_e4m3fnuz,ck,85,0,37.0759,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,192,32768,1536,torch.float8_e4m3fnuz,ck,85,0,73.3389,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,224,32768,512,torch.float8_e4m3fnuz,ck,84,0,45.0439,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,224,32768,1536,torch.float8_e4m3fnuz,ck,85,0,91.0766,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,256,32768,512,torch.float8_e4m3fnuz,ck,85,0,46.3119,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,256,32768,1536,torch.float8_e4m3fnuz,ck,85,0,92.2438,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,288,32768,512,torch.float8_e4m3fnuz,ck,84,0,53.506,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,288,32768,1536,torch.float8_e4m3fnuz,ck,102,0,101.9306,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,32768,512,torch.float8_e4m3fnuz,ck,85,0,55.0543,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,32768,1536,torch.float8_e4m3fnuz,ck,85,0,114.3911,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,352,32768,512,torch.float8_e4m3fnuz,ck,72,0,64.3624,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,352,32768,1536,torch.float8_e4m3fnuz,ck,71,0,132.552,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,32768,512,torch.float8_e4m3fnuz,ck,85,0,63.438,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,32768,1536,torch.float8_e4m3fnuz,ck,71,0,132.9488,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,32768,512,torch.float8_e4m3fnuz,ck,84,0,73.9008,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,416,32768,1536,torch.float8_e4m3fnuz,ck,85,0,155.3704,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,32768,512,torch.float8_e4m3fnuz,ck,85,0,72.2692,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,32768,1536,torch.float8_e4m3fnuz,ck,85,0,153.3388,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,32768,512,torch.float8_e4m3fnuz,ck,102,0,81.2012,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,32768,1536,torch.float8_e4m3fnuz,ck,102,0,155.1241,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,32768,512,torch.float8_e4m3fnuz,ck,85,0,82.7804,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,32768,1536,torch.float8_e4m3fnuz,ck,71,0,167.8113,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,32768,512,torch.float8_e4m3fnuz,ck,71,0,148.0686,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,32768,1536,torch.float8_e4m3fnuz,ck,71,0,317.4319,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,2048,32768,512,torch.float8_e4m3fnuz,ck,71,0,283.8954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,2048,32768,1536,torch.float8_e4m3fnuz,ck,71,0,610.8983,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,32768,512,torch.float8_e4m3fnuz,ck,71,0,554.3222,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,32768,1536,torch.float8_e4m3fnuz,ck,71,0,1193.686,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,32768,512,torch.float8_e4m3fnuz,ck,71,0,793.4262,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,32768,1536,torch.float8_e4m3fnuz,ck,102,0,1794.6908,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,32768,512,torch.float8_e4m3fnuz,ck,71,0,1053.245,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,32768,1536,torch.float8_e4m3fnuz,ck,71,0,2345.8711,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,32768,512,torch.float8_e4m3fnuz,ck,72,0,1357.7711,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,10240,32768,1536,torch.float8_e4m3fnuz,ck,71,0,2923.8951,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,32768,512,torch.float8_e4m3fnuz,ck,71,0,1573.8691,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,32768,1536,torch.float8_e4m3fnuz,ck,71,0,3526.7572,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,32768,512,torch.float8_e4m3fnuz,ck,71,0,1827.3845,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,32768,1536,torch.float8_e4m3fnuz,ck,71,0,4108.8811,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,32768,512,torch.float8_e4m3fnuz,ck,71,0,2090.8366,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,32768,1536,torch.float8_e4m3fnuz,ck,71,0,4672.9569,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,32768,32768,512,torch.float8_e4m3fnuz,ck,71,0,4165.6734,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,32768,32768,1536,torch.float8_e4m3fnuz,ck,71,0,9432.9791,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,32768,512,torch.float8_e4m3fnuz,ck,74,0,1.0,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0.0,0.0,0.0 +gfx942,80,65536,32768,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,98304,32768,1536,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1,36864,7168,torch.float8_e4m3fnuz,ck,6,0,76.2189,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,2,36864,7168,torch.float8_e4m3fnuz,ck,20,0,78.3165,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,0.0,0.0,0.0 +gfx942,80,4,36864,7168,torch.float8_e4m3fnuz,ck,32,0,78.8657,a8w8_bpreshuffle_256x16x512x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v2,0.0,0.0,0.0 +gfx942,80,8,36864,7168,torch.float8_e4m3fnuz,ck,111,0,79.5469,a8w8_bpreshuffle_256x16x512x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,16,36864,7168,torch.float8_e4m3fnuz,ck,6,0,80.7001,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,32,36864,7168,torch.float8_e4m3fnuz,ck,133,0,93.517,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0.0,0.0,0.0 +gfx942,80,64,36864,7168,torch.float8_e4m3fnuz,ck,121,0,129.978,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,96,36864,7168,torch.float8_e4m3fnuz,ck,102,0,155.5081,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,128,36864,7168,torch.float8_e4m3fnuz,ck,93,0,201.1007,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,160,36864,7168,torch.float8_e4m3fnuz,ck,156,0,267.3258,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0.0,0.0,0.0 +gfx942,80,192,36864,7168,torch.float8_e4m3fnuz,ck,94,0,286.0543,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,224,36864,7168,torch.float8_e4m3fnuz,ck,40,0,377.2671,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,256,36864,7168,torch.float8_e4m3fnuz,ck,68,0,371.2046,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,288,36864,7168,torch.float8_e4m3fnuz,ck,102,0,437.3066,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,320,36864,7168,torch.float8_e4m3fnuz,ck,128,0,476.2091,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,352,36864,7168,torch.float8_e4m3fnuz,ck,71,0,568.6663,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,384,36864,7168,torch.float8_e4m3fnuz,ck,94,0,555.2415,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,416,36864,7168,torch.float8_e4m3fnuz,ck,93,0,649.7863,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,448,36864,7168,torch.float8_e4m3fnuz,ck,93,0,631.7091,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,480,36864,7168,torch.float8_e4m3fnuz,ck,102,0,655.0492,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,512,36864,7168,torch.float8_e4m3fnuz,ck,68,0,726.0635,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,1024,36864,7168,torch.float8_e4m3fnuz,ck,93,0,1406.3674,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,2048,36864,7168,torch.float8_e4m3fnuz,ck,93,0,2759.912,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,4096,36864,7168,torch.float8_e4m3fnuz,ck,93,0,5492.6804,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,6144,36864,7168,torch.float8_e4m3fnuz,ck,93,0,8229.8778,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,8192,36864,7168,torch.float8_e4m3fnuz,ck,93,0,10983.7146,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,10240,36864,7168,torch.float8_e4m3fnuz,ck,93,0,13738.3404,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,12288,36864,7168,torch.float8_e4m3fnuz,ck,93,0,16426.5845,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,14336,36864,7168,torch.float8_e4m3fnuz,ck,93,0,19165.8682,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,16384,36864,7168,torch.float8_e4m3fnuz,ck,71,0,21975.6028,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,32768,36864,7168,torch.float8_e4m3fnuz,ck,93,0,43711.479,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx942,80,65536,36864,7168,torch.float8_e4m3fnuz,ck,0,0,1.0,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0.0,0.0,0.0 +gfx950,256,1,256,7168,torch.float8_e4m3fn,cktile,138,0,7.4399,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,0.49,247.68,0.0 +gfx950,256,2,256,7168,torch.float8_e4m3fn,cktile,138,0,7.5203,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,0.98,246.05,0.0 +gfx950,256,4,256,7168,torch.float8_e4m3fn,cktile,138,0,7.6107,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,1.93,245.15,0.0 +gfx950,256,8,256,7168,torch.float8_e4m3fn,cktile,30,0,7.8028,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,3.76,243.05,0.0 +gfx950,256,16,256,7168,torch.float8_e4m3fn,ck,10,0,6.9469,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.45,281.84,0.0 +gfx950,256,32,256,7168,torch.float8_e4m3fn,ck,8,0,6.939,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,16.92,299.87,0.0 +gfx950,256,64,256,7168,torch.float8_e4m3fn,ck,8,0,8.0014,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,29.35,290.77,0.0 +gfx950,256,128,256,7168,torch.float8_e4m3fn,ck,10,0,9.8581,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,47.65,285.86,0.0 +gfx950,256,256,256,7168,torch.float8_e4m3fn,ck,8,0,10.3554,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,90.73,367.06,0.0 +gfx950,256,512,256,7168,torch.float8_e4m3fn,ck,10,0,10.5211,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,178.6,548.15,0.0 +gfx950,256,1024,256,7168,torch.float8_e4m3fn,ck,5,0,11.7135,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,320.83,828.05,0.0 +gfx950,256,2048,256,7168,torch.float8_e4m3fn,cktile,9,0,13.9827,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x128_default,537.54,1256.1,0.0 +gfx950,256,4096,256,7168,torch.float8_e4m3fn,ck,114,0,20.5991,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,729.76,1616.2,0.0 +gfx950,256,8192,256,7168,torch.float8_e4m3fn,cktile,24,0,30.5979,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x128x256_1x4x1_16x16x128_default,982.58,2116.14,0.0 +gfx950,256,16384,256,7168,torch.float8_e4m3fn,ck,139,0,44.5889,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1348.53,2863.14,0.0 +gfx950,256,32768,256,7168,torch.float8_e4m3fn,ck,66,0,75.1594,a8w8_bpreshuffle_256x128x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1600.05,3372.74,0.0 +gfx950,256,1,2112,7168,torch.float8_e4m3fn,ck,10,0,11.5479,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.62,1311.94,0.0 +gfx950,256,2,2112,7168,torch.float8_e4m3fn,cktile,30,0,11.761,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,5.15,1289.14,0.0 +gfx950,256,4,2112,7168,torch.float8_e4m3fn,ck,10,0,11.4824,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.55,1322.41,0.0 +gfx950,256,8,2112,7168,torch.float8_e4m3fn,ck,10,0,11.727,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,20.65,1298.71,0.0 +gfx950,256,16,2112,7168,torch.float8_e4m3fn,ck,10,0,10.6299,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,45.57,1441.32,0.0 +gfx950,256,32,2112,7168,torch.float8_e4m3fn,ck,8,0,10.4988,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,92.29,1476.68,0.0 +gfx950,256,64,2112,7168,torch.float8_e4m3fn,ck,10,0,10.6664,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,181.67,1487.65,0.0 +gfx950,256,96,2112,7168,torch.float8_e4m3fn,cktile,137,0,12.0882,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,240.45,1342.83,0.0 +gfx950,256,128,2112,7168,torch.float8_e4m3fn,ck,5,0,13.7387,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,282.09,1208.05,0.0 +gfx950,256,256,2112,7168,torch.float8_e4m3fn,cktile,152,0,16.4078,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_32x64x512_1x4x1_16x16x128_default,472.4,1100.4,0.0 +gfx950,256,512,2112,7168,torch.float8_e4m3fn,cktile,22,0,22.4044,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_64x64x256_1x4x1_16x16x128_default,691.92,936.04,0.0 +gfx950,256,1024,2112,7168,torch.float8_e4m3fn,cktile,225,0,30.7438,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_80x64x256_1x4x1_16x16x128_default,1008.47,871.86,0.0 +gfx950,256,2048,2112,7168,torch.float8_e4m3fn,cktile,131,0,43.1862,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x128_default,1435.84,890.79,0.0 +gfx950,256,4096,2112,7168,torch.float8_e4m3fn,cktile,131,0,68.6384,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x192x256_1x4x1_16x16x128_default,1806.82,900.38,0.0 +gfx950,256,8192,2112,7168,torch.float8_e4m3fn,flydsl,485,0,119.6395,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x0x1x1_default,2073.18,906.57,0.0 +gfx950,256,16384,2112,7168,torch.float8_e4m3fn,cktile,121,0,218.3492,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2271.91,924.14,0.0 +gfx950,256,32768,2112,7168,torch.float8_e4m3fn,cktile,121,0,404.4086,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,2453.3,960.49,0.0 +gfx950,256,1,3072,1536,torch.float8_e4m3fn,ck,8,0,4.9938,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,1.89,946.43,0.0 +gfx950,256,2,3072,1536,torch.float8_e4m3fn,ck,10,0,4.9926,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,3.78,948.19,0.0 +gfx950,256,4,3072,1536,torch.float8_e4m3fn,ck,8,0,5.0475,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,7.48,940.92,0.0 +gfx950,256,8,3072,1536,torch.float8_e4m3fn,ck,10,0,5.4956,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,13.74,869.79,0.0 +gfx950,256,16,3072,1536,torch.float8_e4m3fn,ck,8,0,5.1326,a8w8_bpreshuffle_128x32x16x512_16x16_16x16_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_intrawave_v1,29.42,943.28,0.0 +gfx950,256,32,3072,1536,torch.float8_e4m3fn,flydsl,499,0,4.9778,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x1x0x1_default,60.67,997.3,0.0 +gfx950,256,64,3072,1536,torch.float8_e4m3fn,flydsl,1289,0,5.0902,flydsl_bpreshuflle_16x64x512_F8_F8_B16_1x0x0x4_default,118.66,1023.56,0.0 +gfx950,256,96,3072,1536,torch.float8_e4m3fn,flydsl,424,0,5.3379,flydsl_bpreshuflle_32x64x512_F8_F8_B16_1x0x1x1_default,169.72,1022.1,0.0 +gfx950,256,128,3072,1536,torch.float8_e4m3fn,flydsl,1318,0,5.7223,flydsl_bpreshuflle_32x64x512_F8_F8_B16_2x0x0x4_default,211.1,996.39,0.0 +gfx950,256,256,3072,1536,torch.float8_e4m3fn,ck,119,0,7.1386,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,338.43,936.41,0.0 +gfx950,256,512,3072,1536,torch.float8_e4m3fn,ck,112,0,8.925,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,541.38,969.27,0.0 +gfx950,256,1024,3072,1536,torch.float8_e4m3fn,flydsl,873,0,11.6927,flydsl_bpreshuflle_64x192x256_F8_F8_B16_2x1x0x2_default,826.47,1076.13,0.0 +gfx950,256,2048,3072,1536,torch.float8_e4m3fn,flydsl,815,0,16.6873,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x0x1x2_default,1158.21,1225.32,0.0 +gfx950,256,4096,3072,1536,torch.float8_e4m3fn,flydsl,651,0,24.8402,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x1x1x1_default,1556.14,1456.34,0.0 +gfx950,256,8192,3072,1536,torch.float8_e4m3fn,flydsl,319,0,48.914,flydsl_bpreshuflle_128x192x128_F8_F8_B16_2x1x1x0_default,1580.52,1382.7,0.0 +gfx950,256,16384,3072,1536,torch.float8_e4m3fn,flydsl,601,0,92.0763,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x1_default,1679.25,1417.82,0.0 +gfx950,256,20480,3072,1536,torch.float8_e4m3fn,ck,58,0,125.715,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1537.39,1288.67,0.0 +gfx950,256,32768,3072,1536,torch.float8_e4m3fn,cktile,121,0,192.7525,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_160x192x128_1x4x1_16x16x128_default,1604.32,1330.08,0.0 +gfx950,256,1,4096,512,torch.float8_e4m3fn,cktile,30,0,2.8621,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,1.47,735.77,0.0 +gfx950,256,2,4096,512,torch.float8_e4m3fn,cktile,2,0,2.6128,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,3.21,809.31,0.0 +gfx950,256,4,4096,512,torch.float8_e4m3fn,cktile,2,0,2.8132,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,5.96,757.84,0.0 +gfx950,256,8,4096,512,torch.float8_e4m3fn,cktile,30,0,2.6844,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,12.5,807.18,0.0 +gfx950,256,16,4096,512,torch.float8_e4m3fn,cktile,33,0,3.0517,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x256_1x4x1_16x16x128_default,21.99,732.84,0.0 +gfx950,256,32,4096,512,torch.float8_e4m3fn,cktile,30,0,3.1527,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,42.57,753.54,0.0 +gfx950,256,64,4096,512,torch.float8_e4m3fn,cktile,30,0,3.319,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,80.88,799.7,0.0 +gfx950,256,128,4096,512,torch.float8_e4m3fn,cktile,30,0,3.4228,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,156.85,938.2,0.0 +gfx950,256,256,4096,512,torch.float8_e4m3fn,flydsl,758,0,4.5124,flydsl_bpreshuflle_64x64x256_F8_F8_B16_1x0x1x2_default,237.95,958.55,0.0 +gfx950,256,512,4096,512,torch.float8_e4m3fn,cktile,110,0,6.4827,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_48x64x256_1x4x1_16x16x128_default,331.26,1010.94,0.0 +gfx950,256,1024,4096,512,torch.float8_e4m3fn,flydsl,16,0,8.0485,flydsl_bpreshuflle_64x256x128_F8_F8_B16_1x0x0x0_default,533.64,1367.96,0.0 +gfx950,256,2048,4096,512,torch.float8_e4m3fn,ck,86,0,13.3525,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,643.32,1492.08,0.0 +gfx950,256,4096,4096,512,torch.float8_e4m3fn,flydsl,186,0,16.832,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,1020.67,2242.68,0.0 +gfx950,256,8192,4096,512,torch.float8_e4m3fn,ck,86,0,41.8389,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,821.24,1754.36,0.0 +gfx950,256,16384,4096,512,torch.float8_e4m3fn,flydsl,269,0,63.474,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x1x0_default,1082.64,2279.73,0.0 +gfx950,256,20480,4096,512,torch.float8_e4m3fn,ck,86,0,91.9844,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,933.85,1960.71,0.0 +gfx950,256,32768,4096,512,torch.float8_e4m3fn,ck,86,0,143.1262,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,960.26,2007.39,0.0 +gfx950,256,1,7168,2048,torch.float8_e4m3fn,cktile,30,0,5.8098,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,5.05,2529.6,0.0 +gfx950,256,2,7168,2048,torch.float8_e4m3fn,cktile,137,0,5.8346,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,10.06,2521.65,0.0 +gfx950,256,4,7168,2048,torch.float8_e4m3fn,cktile,138,0,5.8415,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,20.1,2524.28,0.0 +gfx950,256,8,7168,2048,torch.float8_e4m3fn,cktile,138,0,5.9349,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,39.58,2495.6,0.0 +gfx950,256,16,7168,2048,torch.float8_e4m3fn,ck,10,0,6.0372,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,77.81,2475.02,0.0 +gfx950,256,32,7168,2048,torch.float8_e4m3fn,cktile,137,0,6.2634,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,150.0,2427.49,0.0 +gfx950,256,64,7168,2048,torch.float8_e4m3fn,cktile,37,0,7.2353,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,259.71,2173.88,0.0 +gfx950,256,96,7168,2048,torch.float8_e4m3fn,ck,113,0,8.5986,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,327.79,1890.18,0.0 +gfx950,256,128,7168,2048,torch.float8_e4m3fn,ck,114,0,8.3615,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,449.45,2006.48,0.0 +gfx950,256,256,7168,2048,torch.float8_e4m3fn,flydsl,565,0,11.2626,flydsl_bpreshuflle_128x64x256_F8_F8_B16_2x1x0x1_default,667.36,1675.84,0.0 +gfx950,256,512,7168,2048,torch.float8_e4m3fn,flydsl,152,0,15.7151,flydsl_bpreshuflle_128x128x256_F8_F8_B16_2x0x1x0_default,956.56,1467.93,0.0 +gfx950,256,1024,7168,2048,torch.float8_e4m3fn,ck,70,0,24.0038,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1252.5,1310.51,0.0 +gfx950,256,2048,7168,2048,torch.float8_e4m3fn,flydsl,979,0,39.185,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1534.5,1230.94,0.0 +gfx950,256,4096,7168,2048,torch.float8_e4m3fn,flydsl,979,0,73.2445,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1641.89,1116.66,0.0 +gfx950,256,8192,7168,2048,torch.float8_e4m3fn,flydsl,979,0,130.9968,flydsl_bpreshuflle_128x256x128_F8_F8_B16_2x1x1x2_default,1836.06,1136.65,0.0 +gfx950,256,16384,7168,2048,torch.float8_e4m3fn,flydsl,186,0,246.697,flydsl_bpreshuflle_128x256x128_F8_F8_B16_1x1x0x0_default,1949.91,1147.62,0.0 +gfx950,256,20480,7168,2048,torch.float8_e4m3fn,cktile,115,0,316.101,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,1902.23,1107.95,0.0 +gfx950,256,32768,7168,2048,torch.float8_e4m3fn,cktile,115,0,494.3189,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_96x256x128_1x4x1_16x16x128_default,1946.26,1115.78,0.0 diff --git a/aiter/configs/model_configs/dsv3_a4w4_blockscale_tuned_gemm.csv b/aiter/configs/model_configs/dsv3_a4w4_blockscale_tuned_gemm.csv index fe1d39c330..708f7f6d33 100644 --- a/aiter/configs/model_configs/dsv3_a4w4_blockscale_tuned_gemm.csv +++ b/aiter/configs/model_configs/dsv3_a4w4_blockscale_tuned_gemm.csv @@ -1,31 +1,31 @@ -cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,7168,4608,21,0,9.8118,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.73,1684.88,0.0 -256,2,7168,4608,21,0,9.3912,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,14.07,1762.11,0.0 -256,4,7168,4608,21,0,9.431,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,28.02,1758.21,0.0 -256,8,7168,4608,21,0,9.5886,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,55.12,1736.25,0.0 -256,16,7168,4608,21,0,10.387,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,101.76,1615.61,0.0 -256,32,7168,4608,29,0,10.6068,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,199.3,1607.23,0.0 -256,64,7168,4608,21,0,10.8416,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,389.97,1621.53,0.0 -256,128,7168,4608,21,0,10.8814,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,777.08,1713.47,0.0 -256,256,7168,4608,29,0,11.9871,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1410.8,1733.11,0.0 -256,512,7168,4608,42,0,16.7699,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2016.88,1492.84,0.0 -256,1024,7168,4608,53,0,24.7988,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,2727.78,1353.07,0.0 -256,2048,7168,4608,54,0,35.4237,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3819.24,1428.25,0.0 -256,4096,7168,4608,54,0,75.3418,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3591.41,1123.85,0.0 -256,8192,7168,4608,45,0,141.2117,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3832.3,1082.28,0.0 -256,16384,7168,4608,54,0,245.9159,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4401.23,1175.79,0.0 -256,1,9216,7168,37,0,14.8363,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,8.91,2227.79,0.0 -256,2,9216,7168,29,0,14.3596,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,18.4,2303.28,0.0 -256,4,9216,7168,29,0,13.3742,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,39.52,2476.28,0.0 -256,8,9216,7168,21,0,13.8391,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,76.38,2399.45,0.0 -256,16,9216,7168,21,0,14.3003,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,147.82,2334.38,0.0 -256,32,9216,7168,21,0,15.2927,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,276.46,2205.93,0.0 -256,64,9216,7168,21,0,15.2787,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,553.43,2254.06,0.0 -256,128,9216,7168,29,0,17.2059,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,982.89,2083.48,0.0 -256,256,9216,7168,37,0,19.2284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1759.01,2010.89,0.0 -256,512,9216,7168,49,0,26.407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2561.66,1677.67,0.0 -256,1024,9216,7168,47,0,36.953,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3661.18,1503.92,0.0 -256,2048,9216,7168,47,0,80.7199,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3352.12,967.78,0.0 -256,4096,9216,7168,54,0,135.3548,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3998.13,910.26,0.0 -256,8192,9216,7168,54,0,236.4348,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4577.72,902.51,0.0 -256,16384,9216,7168,54,0,451.2622,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4796.91,872.53,0.0 +gfx,cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,7168,4608,21,0,9.8118,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,6.73,1684.88,0.0 +gfx950,256,2,7168,4608,21,0,9.3912,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,14.07,1762.11,0.0 +gfx950,256,4,7168,4608,21,0,9.431,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,28.02,1758.21,0.0 +gfx950,256,8,7168,4608,21,0,9.5886,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,55.12,1736.25,0.0 +gfx950,256,16,7168,4608,21,0,10.387,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,101.76,1615.61,0.0 +gfx950,256,32,7168,4608,29,0,10.6068,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,199.3,1607.23,0.0 +gfx950,256,64,7168,4608,21,0,10.8416,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,389.97,1621.53,0.0 +gfx950,256,128,7168,4608,21,0,10.8814,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,777.08,1713.47,0.0 +gfx950,256,256,7168,4608,29,0,11.9871,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,1410.8,1733.11,0.0 +gfx950,256,512,7168,4608,42,0,16.7699,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,2016.88,1492.84,0.0 +gfx950,256,1024,7168,4608,53,0,24.7988,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128E,2727.78,1353.07,0.0 +gfx950,256,2048,7168,4608,54,0,35.4237,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3819.24,1428.25,0.0 +gfx950,256,4096,7168,4608,54,0,75.3418,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3591.41,1123.85,0.0 +gfx950,256,8192,7168,4608,45,0,141.2117,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,3832.3,1082.28,0.0 +gfx950,256,16384,7168,4608,54,0,245.9159,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4401.23,1175.79,0.0 +gfx950,256,1,9216,7168,37,0,14.8363,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,8.91,2227.79,0.0 +gfx950,256,2,9216,7168,29,0,14.3596,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,18.4,2303.28,0.0 +gfx950,256,4,9216,7168,29,0,13.3742,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,39.52,2476.28,0.0 +gfx950,256,8,9216,7168,21,0,13.8391,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,76.38,2399.45,0.0 +gfx950,256,16,9216,7168,21,0,14.3003,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,147.82,2334.38,0.0 +gfx950,256,32,9216,7168,21,0,15.2927,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,276.46,2205.93,0.0 +gfx950,256,64,9216,7168,21,0,15.2787,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128E,553.43,2254.06,0.0 +gfx950,256,128,9216,7168,29,0,17.2059,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,982.89,2083.48,0.0 +gfx950,256,256,9216,7168,37,0,19.2284,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128E,1759.01,2010.89,0.0 +gfx950,256,512,9216,7168,49,0,26.407,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128E,2561.66,1677.67,0.0 +gfx950,256,1024,9216,7168,47,0,36.953,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3661.18,1503.92,0.0 +gfx950,256,2048,9216,7168,47,0,80.7199,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256E,3352.12,967.78,0.0 +gfx950,256,4096,9216,7168,54,0,135.3548,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,3998.13,910.26,0.0 +gfx950,256,8192,9216,7168,54,0,236.4348,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4577.72,902.51,0.0 +gfx950,256,16384,9216,7168,54,0,451.2622,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,4796.91,872.53,0.0 diff --git a/aiter/configs/model_configs/dsv3_a8w8_bpreshuffle_tuned_gemm.csv b/aiter/configs/model_configs/dsv3_a8w8_bpreshuffle_tuned_gemm.csv index 97e90bcfad..3cb9ab1345 100644 --- a/aiter/configs/model_configs/dsv3_a8w8_bpreshuffle_tuned_gemm.csv +++ b/aiter/configs/model_configs/dsv3_a8w8_bpreshuffle_tuned_gemm.csv @@ -1,46 +1,46 @@ -cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,6144,1536,torch.float8_e4m3fn,ck,5,0,5.7647,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.27,1639.46,0.0 -256,2,6144,1536,torch.float8_e4m3fn,ck,10,0,5.7535,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,6.56,1645.06,0.0 -256,4,6144,1536,torch.float8_e4m3fn,ck,10,0,5.7738,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,13.08,1644.06,0.0 -256,8,6144,1536,torch.float8_e4m3fn,ck,10,0,5.824,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,25.93,1639.38,0.0 -256,16,6144,1536,torch.float8_e4m3fn,ck,10,0,5.8011,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,52.06,1664.92,0.0 -256,32,6144,1536,torch.float8_e4m3fn,ck,10,0,6.0087,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,100.52,1644.21,0.0 -256,64,6144,1536,torch.float8_e4m3fn,ck,5,0,6.8238,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,177.02,1512.64,0.0 -256,128,6144,1536,torch.float8_e4m3fn,ck,119,0,7.7745,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,310.75,1441.46,0.0 -256,256,6144,1536,torch.float8_e4m3fn,ck,112,0,10.024,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,482.03,1294.51,0.0 -256,512,6144,1536,torch.float8_e4m3fn,ck,77,0,12.9924,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,743.79,1271.13,0.0 -256,1024,6144,1536,torch.float8_e4m3fn,ck,121,0,19.3447,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,999.1,1219.61,0.0 -256,2048,6144,1536,torch.float8_e4m3fn,ck,68,0,30.1273,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1283.05,1252.97,0.0 -256,4096,6144,1536,torch.float8_e4m3fn,ck,141,0,57.5611,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1343.08,1147.66,0.0 -256,8192,6144,1536,torch.float8_e4m3fn,ck,141,0,103.7775,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1489.91,1182.18,0.0 -256,16384,6144,1536,torch.float8_e4m3fn,ck,143,0,190.1067,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1626.65,1241.04,0.0 -256,1,7168,4096,torch.float8_e4m3fn,cktile,138,0,9.338,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,6.29,3146.13,0.0 -256,2,7168,4096,torch.float8_e4m3fn,ck,10,0,9.2269,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,12.73,3186.01,0.0 -256,4,7168,4096,torch.float8_e4m3fn,cktile,138,0,9.5269,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,24.65,3089.55,0.0 -256,8,7168,4096,torch.float8_e4m3fn,cktile,2,0,9.6702,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,48.58,3051.39,0.0 -256,16,7168,4096,torch.float8_e4m3fn,ck,24,0,8.9318,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,105.19,3320.16,0.0 -256,32,7168,4096,torch.float8_e4m3fn,cktile,137,0,9.7916,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,191.9,3058.74,0.0 -256,64,7168,4096,torch.float8_e4m3fn,cktile,37,0,11.3316,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,331.65,2695.1,0.0 -256,128,7168,4096,torch.float8_e4m3fn,ck,114,0,14.084,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,533.67,2252.16,0.0 -256,256,7168,4096,torch.float8_e4m3fn,ck,65,0,17.5236,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,857.84,1944.73,0.0 -256,512,7168,4096,torch.float8_e4m3fn,ck,139,0,25.1676,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1194.58,1541.56,0.0 -256,1024,7168,4096,torch.float8_e4m3fn,ck,154,0,38.5605,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1559.36,1250.88,0.0 -256,2048,7168,4096,torch.float8_e4m3fn,ck,33,0,69.3264,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1734.68,968.01,0.0 -256,4096,7168,4096,torch.float8_e4m3fn,ck,51,0,128.1913,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1876.24,817.98,0.0 -256,8192,7168,4096,torch.float8_e4m3fn,ck,51,0,235.0881,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2046.2,767.18,0.0 -256,16384,7168,4096,torch.float8_e4m3fn,ck,33,0,440.2175,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2185.45,752.7,0.0 -256,1,8192,512,torch.float8_e4m3fn,cktile,30,0,3.5429,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,2.37,1188.63,0.0 -256,2,8192,512,torch.float8_e4m3fn,cktile,30,0,3.5942,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,4.67,1176.37,0.0 -256,4,8192,512,torch.float8_e4m3fn,cktile,30,0,3.525,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,9.52,1209.05,0.0 -256,8,8192,512,torch.float8_e4m3fn,cktile,138,0,3.6239,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,18.52,1194.7,0.0 -256,16,8192,512,torch.float8_e4m3fn,cktile,138,0,3.6015,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,37.27,1239.66,0.0 -256,32,8192,512,torch.float8_e4m3fn,cktile,2,0,3.6727,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,73.09,1289.24,0.0 -256,64,8192,512,torch.float8_e4m3fn,cktile,37,0,4.8855,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,109.89,1079.86,0.0 -256,128,8192,512,torch.float8_e4m3fn,ck,76,0,5.5876,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,192.17,1137.7,0.0 -256,256,8192,512,torch.float8_e4m3fn,ck,84,0,6.6419,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,323.32,1282.72,0.0 -256,512,8192,512,torch.float8_e4m3fn,ck,78,0,9.5127,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,451.5,1350.31,0.0 -256,1024,8192,512,torch.float8_e4m3fn,ck,86,0,13.4701,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,637.7,1595.82,0.0 -256,2048,8192,512,torch.float8_e4m3fn,ck,86,0,22.0387,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,779.53,1760.42,0.0 -256,4096,8192,512,torch.float8_e4m3fn,ck,86,0,40.9594,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,838.87,1792.03,0.0 -256,8192,8192,512,torch.float8_e4m3fn,ck,86,0,74.4073,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,923.56,1916.56,0.0 -256,16384,8192,512,torch.float8_e4m3fn,ck,143,0,138.1708,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,994.7,2033.85,0.0 +gfx,cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,6144,1536,torch.float8_e4m3fn,ck,5,0,5.7647,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.27,1639.46,0.0 +gfx950,256,2,6144,1536,torch.float8_e4m3fn,ck,10,0,5.7535,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,6.56,1645.06,0.0 +gfx950,256,4,6144,1536,torch.float8_e4m3fn,ck,10,0,5.7738,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,13.08,1644.06,0.0 +gfx950,256,8,6144,1536,torch.float8_e4m3fn,ck,10,0,5.824,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,25.93,1639.38,0.0 +gfx950,256,16,6144,1536,torch.float8_e4m3fn,ck,10,0,5.8011,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,52.06,1664.92,0.0 +gfx950,256,32,6144,1536,torch.float8_e4m3fn,ck,10,0,6.0087,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,100.52,1644.21,0.0 +gfx950,256,64,6144,1536,torch.float8_e4m3fn,ck,5,0,6.8238,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,177.02,1512.64,0.0 +gfx950,256,128,6144,1536,torch.float8_e4m3fn,ck,119,0,7.7745,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,310.75,1441.46,0.0 +gfx950,256,256,6144,1536,torch.float8_e4m3fn,ck,112,0,10.024,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,482.03,1294.51,0.0 +gfx950,256,512,6144,1536,torch.float8_e4m3fn,ck,77,0,12.9924,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,743.79,1271.13,0.0 +gfx950,256,1024,6144,1536,torch.float8_e4m3fn,ck,121,0,19.3447,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,999.1,1219.61,0.0 +gfx950,256,2048,6144,1536,torch.float8_e4m3fn,ck,68,0,30.1273,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1283.05,1252.97,0.0 +gfx950,256,4096,6144,1536,torch.float8_e4m3fn,ck,141,0,57.5611,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1343.08,1147.66,0.0 +gfx950,256,8192,6144,1536,torch.float8_e4m3fn,ck,141,0,103.7775,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1489.91,1182.18,0.0 +gfx950,256,16384,6144,1536,torch.float8_e4m3fn,ck,143,0,190.1067,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1626.65,1241.04,0.0 +gfx950,256,1,7168,4096,torch.float8_e4m3fn,cktile,138,0,9.338,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,6.29,3146.13,0.0 +gfx950,256,2,7168,4096,torch.float8_e4m3fn,ck,10,0,9.2269,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,12.73,3186.01,0.0 +gfx950,256,4,7168,4096,torch.float8_e4m3fn,cktile,138,0,9.5269,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,24.65,3089.55,0.0 +gfx950,256,8,7168,4096,torch.float8_e4m3fn,cktile,2,0,9.6702,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,48.58,3051.39,0.0 +gfx950,256,16,7168,4096,torch.float8_e4m3fn,ck,24,0,8.9318,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,105.19,3320.16,0.0 +gfx950,256,32,7168,4096,torch.float8_e4m3fn,cktile,137,0,9.7916,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x3_16x64x512_1x4x1_16x16x128_default,191.9,3058.74,0.0 +gfx950,256,64,7168,4096,torch.float8_e4m3fn,cktile,37,0,11.3316,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,331.65,2695.1,0.0 +gfx950,256,128,7168,4096,torch.float8_e4m3fn,ck,114,0,14.084,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,533.67,2252.16,0.0 +gfx950,256,256,7168,4096,torch.float8_e4m3fn,ck,65,0,17.5236,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,857.84,1944.73,0.0 +gfx950,256,512,7168,4096,torch.float8_e4m3fn,ck,139,0,25.1676,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1194.58,1541.56,0.0 +gfx950,256,1024,7168,4096,torch.float8_e4m3fn,ck,154,0,38.5605,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1559.36,1250.88,0.0 +gfx950,256,2048,7168,4096,torch.float8_e4m3fn,ck,33,0,69.3264,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1734.68,968.01,0.0 +gfx950,256,4096,7168,4096,torch.float8_e4m3fn,ck,51,0,128.1913,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1876.24,817.98,0.0 +gfx950,256,8192,7168,4096,torch.float8_e4m3fn,ck,51,0,235.0881,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2046.2,767.18,0.0 +gfx950,256,16384,7168,4096,torch.float8_e4m3fn,ck,33,0,440.2175,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2185.45,752.7,0.0 +gfx950,256,1,8192,512,torch.float8_e4m3fn,cktile,30,0,3.5429,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,2.37,1188.63,0.0 +gfx950,256,2,8192,512,torch.float8_e4m3fn,cktile,30,0,3.5942,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,4.67,1176.37,0.0 +gfx950,256,4,8192,512,torch.float8_e4m3fn,cktile,30,0,3.525,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_16x64x512_1x4x1_16x16x128_default,9.52,1209.05,0.0 +gfx950,256,8,8192,512,torch.float8_e4m3fn,cktile,138,0,3.6239,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,18.52,1194.7,0.0 +gfx950,256,16,8192,512,torch.float8_e4m3fn,cktile,138,0,3.6015,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x4_16x64x512_1x4x1_16x16x128_default,37.27,1239.66,0.0 +gfx950,256,32,8192,512,torch.float8_e4m3fn,cktile,2,0,3.6727,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_16x64x512_1x4x1_16x16x128_default,73.09,1289.24,0.0 +gfx950,256,64,8192,512,torch.float8_e4m3fn,cktile,37,0,4.8855,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x2_32x64x512_1x4x1_16x16x128_default,109.89,1079.86,0.0 +gfx950,256,128,8192,512,torch.float8_e4m3fn,ck,76,0,5.5876,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,192.17,1137.7,0.0 +gfx950,256,256,8192,512,torch.float8_e4m3fn,ck,84,0,6.6419,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,323.32,1282.72,0.0 +gfx950,256,512,8192,512,torch.float8_e4m3fn,ck,78,0,9.5127,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,451.5,1350.31,0.0 +gfx950,256,1024,8192,512,torch.float8_e4m3fn,ck,86,0,13.4701,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,637.7,1595.82,0.0 +gfx950,256,2048,8192,512,torch.float8_e4m3fn,ck,86,0,22.0387,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,779.53,1760.42,0.0 +gfx950,256,4096,8192,512,torch.float8_e4m3fn,ck,86,0,40.9594,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,838.87,1792.03,0.0 +gfx950,256,8192,8192,512,torch.float8_e4m3fn,ck,86,0,74.4073,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,923.56,1916.56,0.0 +gfx950,256,16384,8192,512,torch.float8_e4m3fn,ck,143,0,138.1708,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,994.7,2033.85,0.0 diff --git a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv index 36e2d91b5e..679280c3b6 100644 --- a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv @@ -1,59 +1,59 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4782,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0195,0.49,492.75 -256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4609,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0332,0.98,495.88 -256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5195,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0293,1.95,495.96 -256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5756,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,3.88,500.13 -256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.6807,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0308,7.65,508.75 -256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8114,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0326,15.03,530.65 -256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,8.172,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0277,21.56,536.31 -256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,8.4622,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0312,27.76,545.99 -256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,8.7848,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0322,33.42,552.98 -256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,8.9146,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0306,39.52,571.58 -256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5363,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0296,43.1,559.23 -256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.2913,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0224,45.65,541.29 -256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,287,8,12.8749,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.023,72.97,580.28 -256,1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2196,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0265,3.28,3286.06 -256,2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2149,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0282,6.57,3289.75 -256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.4972,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0178,156.3,2049.56 -256,112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.3948,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,174.85,1668.3 -256,128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.5107,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,198.64,1673.61 -256,256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.0092,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.006,276.73,1250.62 -256,1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.3014,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0169,1.5,1499.1 -256,2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.4095,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0163,2.94,1475.25 -256,4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.5084,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0136,5.8,1455.66 -256,8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,6.6916,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0064,11.28,1421.32 -256,16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,6.7833,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,22.26,1412.98 -256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,7.7348,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,39.04,1258.22 -256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,8.9019,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0058,50.89,1109.83 -256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,8.9485,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0027,67.5,1120.52 -256,80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,9.4262,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0077,80.09,1079.38 -256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,9.8468,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0027,92.01,1048.25 -256,112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.092,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,104.73,1037.39 -256,128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.0569,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,120.11,1055.68 -256,256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,12.3425,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,195.74,955.76 -256,1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.3747,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0151,3.51,3508.01 -256,2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.5564,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0158,6.86,3435.67 -256,4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,244,8,9.2613,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0139,12.68,3178.16 -256,8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,114,4,9.2997,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0084,25.26,3172.96 -256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,9.4278,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0088,49.83,3145.49 -256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.3011,auto,0.0,83.14,2650.18 -256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.3303,auto,0.0,114.29,2452.89 -256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.0008,auto,0.0,156.58,2544.81 -256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,13.9539,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0034,168.33,2209.75 -256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,14.1437,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0034,199.28,2200.95 -256,112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.4791,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,212.44,2030.13 -256,128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,15.4483,auto,0.0,243.27,2053.26 -256,256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9002,auto,0.0,343.2,1556.09 -256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.8589,auto,0.0,5.05,5052.81 -256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.5282,auto,0.0,10.18,5090.54 -256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1821,auto,0.0,20.51,5131.6 -256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1407,auto,0.0,41.06,5140.44 -256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.1932,auto,0.0,94.58,5930.01 -256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,41.086,auto,0.0,180.44,5674.99 -256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.4891,auto,0.0,169.8,3571.73 -256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,66.227,auto,0.0,223.88,3543.2 -256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.8314,auto,0.0,244.41,3104.28 -256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.5926,auto,0.0,294.21,3123.97 -256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.7086,auto,0.0,333.9,3048.51 -256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.9492,auto,0.0,380.42,3048.67 -256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,135.7516,auto,0.0,436.88,1794.55 +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4782,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0195,0.49,492.75 +gfx950,256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4609,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0332,0.98,495.88 +gfx950,256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5195,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0293,1.95,495.96 +gfx950,256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5756,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,3.88,500.13 +gfx950,256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.6807,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0308,7.65,508.75 +gfx950,256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8114,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0326,15.03,530.65 +gfx950,256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,8.172,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0277,21.56,536.31 +gfx950,256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,8.4622,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0312,27.76,545.99 +gfx950,256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,8.7848,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0322,33.42,552.98 +gfx950,256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,8.9146,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0306,39.52,571.58 +gfx950,256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5363,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0296,43.1,559.23 +gfx950,256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.2913,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0224,45.65,541.29 +gfx950,256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,287,8,12.8749,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.023,72.97,580.28 +gfx950,256,1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2196,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0265,3.28,3286.06 +gfx950,256,2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2149,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0282,6.57,3289.75 +gfx950,256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.4972,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0178,156.3,2049.56 +gfx950,256,112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.3948,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,174.85,1668.3 +gfx950,256,128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.5107,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,198.64,1673.61 +gfx950,256,256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.0092,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.006,276.73,1250.62 +gfx950,256,1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.3014,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0169,1.5,1499.1 +gfx950,256,2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.4095,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0163,2.94,1475.25 +gfx950,256,4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.5084,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0136,5.8,1455.66 +gfx950,256,8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,6.6916,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0064,11.28,1421.32 +gfx950,256,16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,6.7833,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,22.26,1412.98 +gfx950,256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,7.7348,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,39.04,1258.22 +gfx950,256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,8.9019,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0058,50.89,1109.83 +gfx950,256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,8.9485,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0027,67.5,1120.52 +gfx950,256,80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,9.4262,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0077,80.09,1079.38 +gfx950,256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,9.8468,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0027,92.01,1048.25 +gfx950,256,112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.092,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,104.73,1037.39 +gfx950,256,128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.0569,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,120.11,1055.68 +gfx950,256,256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,12.3425,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,195.74,955.76 +gfx950,256,1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.3747,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0151,3.51,3508.01 +gfx950,256,2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.5564,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0158,6.86,3435.67 +gfx950,256,4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,244,8,9.2613,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0139,12.68,3178.16 +gfx950,256,8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,114,4,9.2997,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0084,25.26,3172.96 +gfx950,256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,9.4278,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0088,49.83,3145.49 +gfx950,256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.3011,auto,0.0,83.14,2650.18 +gfx950,256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.3303,auto,0.0,114.29,2452.89 +gfx950,256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.0008,auto,0.0,156.58,2544.81 +gfx950,256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,13.9539,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0034,168.33,2209.75 +gfx950,256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,14.1437,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0034,199.28,2200.95 +gfx950,256,112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.4791,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,212.44,2030.13 +gfx950,256,128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,15.4483,auto,0.0,243.27,2053.26 +gfx950,256,256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9002,auto,0.0,343.2,1556.09 +gfx950,256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.8589,auto,0.0,5.05,5052.81 +gfx950,256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.5282,auto,0.0,10.18,5090.54 +gfx950,256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1821,auto,0.0,20.51,5131.6 +gfx950,256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1407,auto,0.0,41.06,5140.44 +gfx950,256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.1932,auto,0.0,94.58,5930.01 +gfx950,256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,41.086,auto,0.0,180.44,5674.99 +gfx950,256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.4891,auto,0.0,169.8,3571.73 +gfx950,256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,66.227,auto,0.0,223.88,3543.2 +gfx950,256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.8314,auto,0.0,244.41,3104.28 +gfx950,256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.5926,auto,0.0,294.21,3123.97 +gfx950,256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.7086,auto,0.0,333.9,3048.51 +gfx950,256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.9492,auto,0.0,380.42,3048.67 +gfx950,256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,135.7516,auto,0.0,436.88,1794.55 diff --git a/aiter/configs/model_configs/glm5_a8w8_blockscale_bpreshuffle_tuned_gemm.csv b/aiter/configs/model_configs/glm5_a8w8_blockscale_bpreshuffle_tuned_gemm.csv index d0ba82a60d..a3f918d358 100644 --- a/aiter/configs/model_configs/glm5_a8w8_blockscale_bpreshuffle_tuned_gemm.csv +++ b/aiter/configs/model_configs/glm5_a8w8_blockscale_bpreshuffle_tuned_gemm.csv @@ -1,81 +1,81 @@ -cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -256,1,128,6144,ck,7,0,10.229,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.15,77.51,0.0 -256,2,128,6144,ck,7,0,10.0576,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.31,79.47,0.0 -256,4,128,6144,ck,7,0,10.1416,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.62,80.07,0.0 -256,8,128,6144,ck,7,0,10.5937,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.19,79.07,0.0 -256,16,128,6144,ck,7,0,10.2885,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.45,86.39,0.0 -256,32,128,6144,ck,7,0,9.8484,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.11,100.65,0.0 -256,48,128,6144,ck,7,0,9.7423,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.75,112.26,0.0 -256,64,128,6144,ck,7,0,10.3252,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.75,115.84,0.0 -256,128,128,6144,ck,7,0,10.6055,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.98,151.4,0.0 -256,256,128,6144,ck,7,0,11.7011,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,34.41,207.23,0.0 -256,512,128,6144,ck,7,0,14.8179,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.35,274.21,0.0 -256,1024,128,6144,ck,7,0,15.3151,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,105.17,479.27,0.0 -256,2048,128,6144,ck,7,0,14.9456,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,215.53,929.61,0.0 -256,4096,128,6144,ck,12,0,16.2952,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,395.36,1656.98,0.0 -256,8192,128,6144,ck,17,0,20.3201,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,634.1,2618.85,0.0 -256,16384,128,6144,ck,17,0,35.0789,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,734.62,3011.61,0.0 -256,1,2624,6144,ck,7,0,17.0117,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.9,948.36,0.0 -256,2,2624,6144,ck,7,0,16.8584,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.83,957.66,0.0 -256,4,2624,6144,ck,7,0,17.3416,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.44,932.29,0.0 -256,8,2624,6144,ck,7,0,17.3829,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.84,932.7,0.0 -256,16,2624,6144,ck,7,0,15.5025,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.28,1051.71,0.0 -256,32,2624,6144,ck,7,0,15.5856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,66.2,1057.8,0.0 -256,48,2624,6144,ck,7,0,15.1016,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,102.49,1103.77,0.0 -256,64,2624,6144,ck,7,0,15.0601,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.02,1118.91,0.0 -256,128,2624,6144,ck,12,0,16.4056,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,251.57,1071.59,0.0 -256,256,2624,6144,ck,17,0,18.765,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,439.88,1014.56,0.0 -256,512,2624,6144,ck,17,0,28.3073,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,583.2,775.58,0.0 -256,1024,2624,6144,ck,15,0,41.707,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,791.66,666.25,0.0127 -256,2048,2624,6144,cktile,27,0,75.9502,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,869.45,519.45,0.0 -256,4096,2624,6144,cktile,28,0,83.0256,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1590.72,756.19,0.0 -256,8192,2624,6144,cktile,11,0,160.8269,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1642.39,680.52,0.0 -256,16384,2624,6144,cktile,11,0,309.3895,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1707.49,655.38,0.0 -256,1,3072,6144,ck,7,0,16.977,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.22,1112.48,0.0 -256,2,3072,6144,ck,7,0,17.362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.35,1088.52,0.0 -256,4,3072,6144,ck,7,0,17.4624,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.65,1083.67,0.0 -256,8,3072,6144,ck,7,0,17.2502,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.51,1099.85,0.0 -256,16,3072,6144,ck,7,0,15.612,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.69,1221.56,0.0 -256,32,3072,6144,ck,7,0,15.6472,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.2,1231.38,0.0 -256,48,3072,6144,ck,7,0,15.5615,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,116.44,1250.79,0.0 -256,64,3072,6144,ck,7,0,15.5445,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,155.42,1264.81,0.0 -256,128,3072,6144,ck,12,0,16.6159,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,290.8,1230.58,0.0 -256,256,3072,6144,ck,17,0,19.217,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,502.87,1145.87,0.0 -256,512,3072,6144,ck,17,0,28.6558,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,674.47,878.21,0.0 -256,1024,3072,6144,ck,14,0,41.032,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,942.06,766.65,0.0001 -256,2048,3072,6144,ck,13,0,63.463,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1218.18,693.95,0.0 -256,4096,3072,6144,ck,13,0,100.7871,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1534.11,686.66,0.0 -256,8192,3072,6144,ck,13,0,182.1825,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1697.41,656.14,0.0 -256,16384,3072,6144,cktile,11,0,334.1006,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1851.16,659.09,0.0 -256,1,3584,512,ck,6,0,3.2478,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,1.13,567.36,0.0 -256,2,3584,512,ck,7,0,3.0064,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.44,615.48,0.0 -256,4,3584,512,ck,6,0,3.3027,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,4.44,564.91,0.0 -256,8,3584,512,ck,7,0,3.5929,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.17,527.83,0.0 -256,16,3584,512,ck,6,0,3.3578,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,17.49,583.09,0.0 -256,32,3584,512,ck,12,0,3.4342,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,34.2,605.9,0.0 -256,48,3584,512,ck,5,0,3.8924,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,45.26,566.14,0.0 -256,64,3584,512,ck,7,0,3.3788,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,69.52,688.57,0.0 -256,128,3584,512,ck,7,0,3.9047,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,120.31,721.71,0.0 -256,256,3584,512,ck,11,0,4.6357,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,202.67,819.96,0.0 -256,512,3584,512,ck,16,0,5.9296,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,316.89,972.61,0.0 -256,1024,3584,512,ck,14,0,8.4491,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,444.79,1147.97,0.0 -256,2048,3584,512,ck,9,0,13.2647,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,566.63,1324.09,0.0005 -256,4096,3584,512,ck,16,0,21.0249,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,714.98,1583.47,0.0 -256,8192,3584,512,ck,16,0,37.8785,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,793.72,1709.4,0.0 -256,16384,3584,512,ck,16,0,67.628,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,889.12,1887.74,0.0 -256,1,6144,1536,ck,7,0,7.1899,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.63,1314.48,0.0 -256,2,6144,1536,ck,7,0,6.7452,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.6,1403.2,0.0 -256,4,6144,1536,ck,7,0,6.5478,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.53,1449.72,0.0 -256,8,6144,1536,ck,7,0,6.5386,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.09,1460.22,0.0 -256,16,6144,1536,ck,7,0,6.5494,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.11,1474.7,0.0 -256,32,6144,1536,ck,7,0,6.4059,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.28,1542.26,0.0 -256,48,6144,1536,ck,6,0,6.6653,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,135.92,1515.42,0.0 -256,64,6144,1536,ck,12,0,6.7173,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,179.83,1536.62,0.0 -256,128,6144,1536,ck,11,0,8.7179,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,277.12,1285.48,0.0 -256,256,6144,1536,ck,12,0,10.3453,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,467.06,1254.3,0.0 -256,512,6144,1536,ck,15,0,14.5306,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,665.06,1136.57,0.0122 -256,1024,6144,1536,ck,14,0,20.896,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,924.93,1129.07,0.0002 -256,2048,6144,1536,ck,13,0,37.4058,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1033.39,1009.17,0.0 -256,4096,6144,1536,ck,13,0,62.9487,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1228.13,1049.43,0.0 -256,8192,6144,1536,ck,13,0,114.464,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1350.81,1071.81,0.0 -256,16384,6144,1536,ck,13,0,211.6928,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1460.78,1114.49,0.0 +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,128,6144,ck,7,0,10.229,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.15,77.51,0.0 +gfx950,256,2,128,6144,ck,7,0,10.0576,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.31,79.47,0.0 +gfx950,256,4,128,6144,ck,7,0,10.1416,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.62,80.07,0.0 +gfx950,256,8,128,6144,ck,7,0,10.5937,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.19,79.07,0.0 +gfx950,256,16,128,6144,ck,7,0,10.2885,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.45,86.39,0.0 +gfx950,256,32,128,6144,ck,7,0,9.8484,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.11,100.65,0.0 +gfx950,256,48,128,6144,ck,7,0,9.7423,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.75,112.26,0.0 +gfx950,256,64,128,6144,ck,7,0,10.3252,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.75,115.84,0.0 +gfx950,256,128,128,6144,ck,7,0,10.6055,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.98,151.4,0.0 +gfx950,256,256,128,6144,ck,7,0,11.7011,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,34.41,207.23,0.0 +gfx950,256,512,128,6144,ck,7,0,14.8179,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.35,274.21,0.0 +gfx950,256,1024,128,6144,ck,7,0,15.3151,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,105.17,479.27,0.0 +gfx950,256,2048,128,6144,ck,7,0,14.9456,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,215.53,929.61,0.0 +gfx950,256,4096,128,6144,ck,12,0,16.2952,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,395.36,1656.98,0.0 +gfx950,256,8192,128,6144,ck,17,0,20.3201,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,634.1,2618.85,0.0 +gfx950,256,16384,128,6144,ck,17,0,35.0789,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,734.62,3011.61,0.0 +gfx950,256,1,2624,6144,ck,7,0,17.0117,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.9,948.36,0.0 +gfx950,256,2,2624,6144,ck,7,0,16.8584,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.83,957.66,0.0 +gfx950,256,4,2624,6144,ck,7,0,17.3416,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.44,932.29,0.0 +gfx950,256,8,2624,6144,ck,7,0,17.3829,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.84,932.7,0.0 +gfx950,256,16,2624,6144,ck,7,0,15.5025,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.28,1051.71,0.0 +gfx950,256,32,2624,6144,ck,7,0,15.5856,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,66.2,1057.8,0.0 +gfx950,256,48,2624,6144,ck,7,0,15.1016,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,102.49,1103.77,0.0 +gfx950,256,64,2624,6144,ck,7,0,15.0601,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.02,1118.91,0.0 +gfx950,256,128,2624,6144,ck,12,0,16.4056,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,251.57,1071.59,0.0 +gfx950,256,256,2624,6144,ck,17,0,18.765,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,439.88,1014.56,0.0 +gfx950,256,512,2624,6144,ck,17,0,28.3073,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,583.2,775.58,0.0 +gfx950,256,1024,2624,6144,ck,15,0,41.707,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,791.66,666.25,0.0127 +gfx950,256,2048,2624,6144,cktile,27,0,75.9502,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,869.45,519.45,0.0 +gfx950,256,4096,2624,6144,cktile,28,0,83.0256,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1590.72,756.19,0.0 +gfx950,256,8192,2624,6144,cktile,11,0,160.8269,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1642.39,680.52,0.0 +gfx950,256,16384,2624,6144,cktile,11,0,309.3895,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1707.49,655.38,0.0 +gfx950,256,1,3072,6144,ck,7,0,16.977,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.22,1112.48,0.0 +gfx950,256,2,3072,6144,ck,7,0,17.362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.35,1088.52,0.0 +gfx950,256,4,3072,6144,ck,7,0,17.4624,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.65,1083.67,0.0 +gfx950,256,8,3072,6144,ck,7,0,17.2502,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.51,1099.85,0.0 +gfx950,256,16,3072,6144,ck,7,0,15.612,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.69,1221.56,0.0 +gfx950,256,32,3072,6144,ck,7,0,15.6472,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,77.2,1231.38,0.0 +gfx950,256,48,3072,6144,ck,7,0,15.5615,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,116.44,1250.79,0.0 +gfx950,256,64,3072,6144,ck,7,0,15.5445,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,155.42,1264.81,0.0 +gfx950,256,128,3072,6144,ck,12,0,16.6159,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,290.8,1230.58,0.0 +gfx950,256,256,3072,6144,ck,17,0,19.217,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,502.87,1145.87,0.0 +gfx950,256,512,3072,6144,ck,17,0,28.6558,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,674.47,878.21,0.0 +gfx950,256,1024,3072,6144,ck,14,0,41.032,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,942.06,766.65,0.0001 +gfx950,256,2048,3072,6144,ck,13,0,63.463,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1218.18,693.95,0.0 +gfx950,256,4096,3072,6144,ck,13,0,100.7871,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1534.11,686.66,0.0 +gfx950,256,8192,3072,6144,ck,13,0,182.1825,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1697.41,656.14,0.0 +gfx950,256,16384,3072,6144,cktile,11,0,334.1006,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1851.16,659.09,0.0 +gfx950,256,1,3584,512,ck,6,0,3.2478,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,1.13,567.36,0.0 +gfx950,256,2,3584,512,ck,7,0,3.0064,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.44,615.48,0.0 +gfx950,256,4,3584,512,ck,6,0,3.3027,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,4.44,564.91,0.0 +gfx950,256,8,3584,512,ck,7,0,3.5929,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.17,527.83,0.0 +gfx950,256,16,3584,512,ck,6,0,3.3578,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,17.49,583.09,0.0 +gfx950,256,32,3584,512,ck,12,0,3.4342,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,34.2,605.9,0.0 +gfx950,256,48,3584,512,ck,5,0,3.8924,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,45.26,566.14,0.0 +gfx950,256,64,3584,512,ck,7,0,3.3788,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,69.52,688.57,0.0 +gfx950,256,128,3584,512,ck,7,0,3.9047,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,120.31,721.71,0.0 +gfx950,256,256,3584,512,ck,11,0,4.6357,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,202.67,819.96,0.0 +gfx950,256,512,3584,512,ck,16,0,5.9296,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,316.89,972.61,0.0 +gfx950,256,1024,3584,512,ck,14,0,8.4491,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,444.79,1147.97,0.0 +gfx950,256,2048,3584,512,ck,9,0,13.2647,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,566.63,1324.09,0.0005 +gfx950,256,4096,3584,512,ck,16,0,21.0249,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,714.98,1583.47,0.0 +gfx950,256,8192,3584,512,ck,16,0,37.8785,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,793.72,1709.4,0.0 +gfx950,256,16384,3584,512,ck,16,0,67.628,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,889.12,1887.74,0.0 +gfx950,256,1,6144,1536,ck,7,0,7.1899,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.63,1314.48,0.0 +gfx950,256,2,6144,1536,ck,7,0,6.7452,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.6,1403.2,0.0 +gfx950,256,4,6144,1536,ck,7,0,6.5478,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.53,1449.72,0.0 +gfx950,256,8,6144,1536,ck,7,0,6.5386,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.09,1460.22,0.0 +gfx950,256,16,6144,1536,ck,7,0,6.5494,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.11,1474.7,0.0 +gfx950,256,32,6144,1536,ck,7,0,6.4059,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.28,1542.26,0.0 +gfx950,256,48,6144,1536,ck,6,0,6.6653,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,135.92,1515.42,0.0 +gfx950,256,64,6144,1536,ck,12,0,6.7173,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,179.83,1536.62,0.0 +gfx950,256,128,6144,1536,ck,11,0,8.7179,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,277.12,1285.48,0.0 +gfx950,256,256,6144,1536,ck,12,0,10.3453,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,467.06,1254.3,0.0 +gfx950,256,512,6144,1536,ck,15,0,14.5306,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,665.06,1136.57,0.0122 +gfx950,256,1024,6144,1536,ck,14,0,20.896,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,924.93,1129.07,0.0002 +gfx950,256,2048,6144,1536,ck,13,0,37.4058,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1033.39,1009.17,0.0 +gfx950,256,4096,6144,1536,ck,13,0,62.9487,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1228.13,1049.43,0.0 +gfx950,256,8192,6144,1536,ck,13,0,114.464,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1350.81,1071.81,0.0 +gfx950,256,16384,6144,1536,ck,13,0,211.6928,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1460.78,1114.49,0.0 diff --git a/aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv b/aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv index e865d7c8ac..7995ce7f0d 100644 --- a/aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv @@ -1,89 +1,89 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.045,auto,0.0,0.02,17.6 -256,2,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0376,auto,0.0,0.03,18.14 -256,4,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0018,auto,0.0,0.07,19.24 -256,8,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0751,auto,0.0,0.14,21.32 -256,16,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,16.6651,auto,0.0,0.38,35.45 -256,32,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,18.2784,auto,0.0,0.69,43.14 -256,48,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,22.8772,auto,0.0,0.83,43.1 -256,64,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.1289,auto,0.0,1.09,51.18 -256,128,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,34.4386,auto,0.0,1.46,57.33 -256,256,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,55.8106,auto,0.0,1.8,63.7 -256,16384,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,116.3523,auto,0.0,55.37,1742.71 -256,1,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.388,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0469,0.29,294.25 -256,2,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.4358,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0273,0.58,293.97 -256,4,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.433,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0293,1.16,298.74 -256,8,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.4742,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0322,2.3,305.65 -256,16,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,6.0986,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0288,4.13,290.82 -256,32,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,6.0842,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0237,8.27,324.49 -256,48,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,16,5.9414,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.028,12.71,366.07 -256,64,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,6.8395,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0316,14.72,347.35 -256,128,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,7.8346,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0308,25.7,405.7 -256,256,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,12,9.9652,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0276,40.41,480.08 -256,16384,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,65.0045,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,396.43,3185.84 -256,1,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,12,7.3627,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0391,0.43,428.99 -256,2,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.5667,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0273,0.83,419.12 -256,4,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.2016,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0254,1.75,443.92 -256,8,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.3839,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0303,3.41,439.89 -256,16,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.7579,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0291,6.49,431.89 -256,32,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.6427,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0272,13.17,465.19 -256,48,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,13,7.7081,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0267,19.59,487.82 -256,64,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,15,8.0088,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0294,25.14,495.07 -256,128,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,12,10.0663,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0257,40.0,475.26 -256,256,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,8,12.1464,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0203,66.3,528.76 -256,16384,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,86.0749,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,598.78,2472.97 -256,1,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,362,16,12.1556,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0278,2.65,2654.02 -256,2,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,11.1667,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.028,5.77,2890.63 -256,4,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,292,16,11.8259,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0291,10.91,2732.46 -256,8,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,16,12.4589,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0304,20.7,2599.27 -256,16,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.6223,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,44.39,2798.44 -256,32,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.5445,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0204,76.18,2422.01 -256,48,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.2686,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0176,108.47,2318.76 -256,64,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6224,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0175,141.13,2281.84 -256,128,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,18.7174,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0101,220.5,1842.58 -256,256,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,25.7364,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0057,320.73,1427.28 -256,16384,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,75,1,758.9775,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,696.04,421.03 -256,1,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.3424,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.009,1.8,1797.13 -256,2,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.0829,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,3.69,1849.83 -256,4,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.3991,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0079,7.14,1790.21 -256,8,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.2796,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,14.46,1818.56 -256,16,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7359,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0088,27.57,1743.43 -256,32,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7262,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0088,55.2,1765.38 -256,48,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,10.1698,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0089,79.19,1707.71 -256,64,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,4,10.4146,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0089,103.1,1686.44 -256,128,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,11.8652,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0034,180.99,1546.55 -256,256,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,16.5474,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,259.56,1203.99 -256,16384,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,213.9689,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1284.66,1019.32 -256,1,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,82,8,12.0812,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0163,3.12,3126.11 -256,1,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,12.0447,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0176,4.18,4180.44 -256,1,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.2005,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0205,4.66,4661.71 -256,2,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,290,16,11.5991,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0295,6.51,3257.63 -256,2,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,85,16,12.6809,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,7.94,3972.32 -256,2,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.3454,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0201,9.24,4621.89 -256,4,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,387,8,12.6496,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0177,11.94,2990.01 -256,4,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.3794,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0199,16.26,4072.38 -256,4,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.5427,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,18.26,4569.74 -256,8,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,114,4,12.8874,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0106,23.43,2940.56 -256,8,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,13.2214,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.012,30.45,3819.22 -256,8,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.9831,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,35.56,4457.02 -256,16,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.6886,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0179,47.6,2998.25 -256,16,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,4,13.4396,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0119,59.92,3769.41 -256,16,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,17.8209,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0131,67.78,4258.52 -256,32,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,13.4164,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0106,90.04,2857.59 -256,32,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.3685,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0118,104.8,3317.63 -256,32,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,19.7695,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0132,122.2,3858.67 -256,48,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,15.8977,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0042,113.97,2430.13 -256,48,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,19.2217,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0116,125.69,2669.62 -256,48,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,23.8084,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0133,152.21,3220.59 -256,64,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,16.398,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.004,147.33,2373.97 -256,64,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,19.8091,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0049,162.61,2607.0 -256,64,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,25.8376,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0056,187.01,2982.88 -256,128,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.581,auto,0.0,223.89,1858.49 -256,128,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,27.5675,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,233.7,1920.85 -256,128,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,38.984,auto,0.0,247.89,2017.32 -256,256,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,29.8896,auto,0.0,323.31,1420.81 -256,256,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,37.6774,auto,0.0,341.98,1475.01 -256,256,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,55.4977,auto,0.0,348.26,1473.74 -256,16384,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,441.2049,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1401.79,770.02 -256,16384,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,824.3818,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1500.46,580.01 -256,1,38720,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,89.7892,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,5.3,5299.98 +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,1,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.045,auto,0.0,0.02,17.6 +gfx950,256,2,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0376,auto,0.0,0.03,18.14 +gfx950,256,4,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0018,auto,0.0,0.07,19.24 +gfx950,256,8,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0751,auto,0.0,0.14,21.32 +gfx950,256,16,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,16.6651,auto,0.0,0.38,35.45 +gfx950,256,32,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,18.2784,auto,0.0,0.69,43.14 +gfx950,256,48,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,22.8772,auto,0.0,0.83,43.1 +gfx950,256,64,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.1289,auto,0.0,1.09,51.18 +gfx950,256,128,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,34.4386,auto,0.0,1.46,57.33 +gfx950,256,256,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,55.8106,auto,0.0,1.8,63.7 +gfx950,256,16384,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,116.3523,auto,0.0,55.37,1742.71 +gfx950,256,1,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.388,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0469,0.29,294.25 +gfx950,256,2,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.4358,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0273,0.58,293.97 +gfx950,256,4,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.433,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0293,1.16,298.74 +gfx950,256,8,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.4742,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0322,2.3,305.65 +gfx950,256,16,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,6.0986,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0288,4.13,290.82 +gfx950,256,32,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,6.0842,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0237,8.27,324.49 +gfx950,256,48,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,16,5.9414,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.028,12.71,366.07 +gfx950,256,64,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,6.8395,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0316,14.72,347.35 +gfx950,256,128,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,7.8346,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0308,25.7,405.7 +gfx950,256,256,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,12,9.9652,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0276,40.41,480.08 +gfx950,256,16384,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,65.0045,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,396.43,3185.84 +gfx950,256,1,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,12,7.3627,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0391,0.43,428.99 +gfx950,256,2,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.5667,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0273,0.83,419.12 +gfx950,256,4,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.2016,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0254,1.75,443.92 +gfx950,256,8,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.3839,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0303,3.41,439.89 +gfx950,256,16,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.7579,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0291,6.49,431.89 +gfx950,256,32,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.6427,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0272,13.17,465.19 +gfx950,256,48,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,13,7.7081,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0267,19.59,487.82 +gfx950,256,64,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,15,8.0088,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0294,25.14,495.07 +gfx950,256,128,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,12,10.0663,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0257,40.0,475.26 +gfx950,256,256,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,8,12.1464,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0203,66.3,528.76 +gfx950,256,16384,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,86.0749,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,598.78,2472.97 +gfx950,256,1,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,362,16,12.1556,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0278,2.65,2654.02 +gfx950,256,2,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,11.1667,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.028,5.77,2890.63 +gfx950,256,4,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,292,16,11.8259,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0291,10.91,2732.46 +gfx950,256,8,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,16,12.4589,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0304,20.7,2599.27 +gfx950,256,16,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.6223,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,44.39,2798.44 +gfx950,256,32,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.5445,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0204,76.18,2422.01 +gfx950,256,48,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.2686,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0176,108.47,2318.76 +gfx950,256,64,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6224,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0175,141.13,2281.84 +gfx950,256,128,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,18.7174,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0101,220.5,1842.58 +gfx950,256,256,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,25.7364,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0057,320.73,1427.28 +gfx950,256,16384,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,75,1,758.9775,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,696.04,421.03 +gfx950,256,1,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.3424,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.009,1.8,1797.13 +gfx950,256,2,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.0829,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,3.69,1849.83 +gfx950,256,4,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.3991,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0079,7.14,1790.21 +gfx950,256,8,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.2796,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,14.46,1818.56 +gfx950,256,16,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7359,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0088,27.57,1743.43 +gfx950,256,32,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7262,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0088,55.2,1765.38 +gfx950,256,48,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,10.1698,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0089,79.19,1707.71 +gfx950,256,64,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,4,10.4146,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0089,103.1,1686.44 +gfx950,256,128,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,11.8652,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0034,180.99,1546.55 +gfx950,256,256,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,16.5474,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,259.56,1203.99 +gfx950,256,16384,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,213.9689,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1284.66,1019.32 +gfx950,256,1,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,82,8,12.0812,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0163,3.12,3126.11 +gfx950,256,1,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,12.0447,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0176,4.18,4180.44 +gfx950,256,1,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.2005,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0205,4.66,4661.71 +gfx950,256,2,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,290,16,11.5991,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0295,6.51,3257.63 +gfx950,256,2,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,85,16,12.6809,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,7.94,3972.32 +gfx950,256,2,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.3454,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0201,9.24,4621.89 +gfx950,256,4,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,387,8,12.6496,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0177,11.94,2990.01 +gfx950,256,4,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.3794,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0199,16.26,4072.38 +gfx950,256,4,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.5427,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,18.26,4569.74 +gfx950,256,8,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,114,4,12.8874,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0106,23.43,2940.56 +gfx950,256,8,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,13.2214,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.012,30.45,3819.22 +gfx950,256,8,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.9831,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,35.56,4457.02 +gfx950,256,16,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.6886,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0179,47.6,2998.25 +gfx950,256,16,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,4,13.4396,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0119,59.92,3769.41 +gfx950,256,16,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,17.8209,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0131,67.78,4258.52 +gfx950,256,32,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,13.4164,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0106,90.04,2857.59 +gfx950,256,32,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.3685,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0118,104.8,3317.63 +gfx950,256,32,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,19.7695,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0132,122.2,3858.67 +gfx950,256,48,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,15.8977,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0042,113.97,2430.13 +gfx950,256,48,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,19.2217,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0116,125.69,2669.62 +gfx950,256,48,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,23.8084,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0133,152.21,3220.59 +gfx950,256,64,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,16.398,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.004,147.33,2373.97 +gfx950,256,64,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,19.8091,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0049,162.61,2607.0 +gfx950,256,64,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,25.8376,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0056,187.01,2982.88 +gfx950,256,128,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.581,auto,0.0,223.89,1858.49 +gfx950,256,128,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,27.5675,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,233.7,1920.85 +gfx950,256,128,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,38.984,auto,0.0,247.89,2017.32 +gfx950,256,256,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,29.8896,auto,0.0,323.31,1420.81 +gfx950,256,256,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,37.6774,auto,0.0,341.98,1475.01 +gfx950,256,256,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,55.4977,auto,0.0,348.26,1473.74 +gfx950,256,16384,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,441.2049,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1401.79,770.02 +gfx950,256,16384,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,824.3818,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1500.46,580.01 +gfx950,256,1,38720,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,89.7892,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,5.3,5299.98 diff --git a/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv b/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv index d6f8b7baad..1deaa684f9 100644 --- a/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv @@ -1,58 +1,58 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9558,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0234,0.15,149.99 -256,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.9466,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0195,0.3,151.48 -256,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9687,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0156,0.59,153.23 -256,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9927,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0176,1.18,157.31 -256,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,5.031,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0171,2.34,165.68 -256,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.6354,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0203,5.09,200.59 -256,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,5.2547,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0212,6.73,195.26 -256,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,13,5.3561,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.021,8.81,209.54 -256,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,13,5.6419,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0218,10.45,215.98 -256,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,9,5.7166,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,12.38,230.0 -256,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.9183,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0165,13.95,238.43 -256,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.97,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0172,15.81,252.48 -256,256,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,7.0187,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0126,26.89,324.47 -256,1,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.6772,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0105,1.52,1524.87 -256,2,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.8371,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0107,3.0,1501.19 -256,4,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.8551,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0118,5.98,1500.66 -256,8,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.9035,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0101,11.91,1497.72 -256,16,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.0897,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,23.38,1478.7 -256,32,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.2307,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,46.12,1475.34 -256,48,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.7334,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0073,65.94,1422.46 -256,64,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.6753,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0074,88.4,1446.51 -256,128,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,11.6504,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0074,162.01,1385.21 -256,256,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,15.3742,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0041,245.53,1140.28 -256,1,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.0917,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0083,1.3,1298.58 -256,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.0871,auto,0.0,2.34,2340.31 -256,2,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.2607,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0056,2.55,1275.95 -256,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.6663,auto,0.0,4.88,2443.63 -256,4,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.1797,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,5.14,1289.36 -256,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.1637,auto,0.0,9.29,2326.79 -256,8,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.2315,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0092,10.22,1286.39 -256,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.4653,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0132,16.46,2067.51 -256,16,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.3253,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0062,20.24,1281.91 -256,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.591,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0136,32.57,2054.71 -256,32,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.2671,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0065,40.73,1306.98 -256,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.7821,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0137,64.08,2040.33 -256,48,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,9.8145,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.009,57.69,1250.15 -256,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,12.1519,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0136,93.19,1996.61 -256,64,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,4,10.1075,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.009,74.69,1229.51 -256,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,12.6689,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0137,119.19,1932.76 -256,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,13.1447,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,143.59,1879.78 -256,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,13.5787,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0136,166.8,1836.14 -256,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.4374,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,151.54,1442.62 -256,128,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,11.3635,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0035,132.88,1149.12 -256,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.3233,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,174.33,1465.01 -256,256,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.5453,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,194.26,921.15 -256,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.5513,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,2.55,2554.45 -256,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.4061,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,5.17,2588.37 -256,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.0536,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,9.79,2451.98 -256,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.96,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,19.73,2476.52 -256,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2044,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,38.66,2437.42 -256,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.3457,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,76.44,2430.26 -256,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,13.0203,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0076,108.72,2324.0 -256,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.4435,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0077,140.4,2269.89 -256,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,14.2374,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0076,165.71,2161.29 -256,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,14.4747,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0076,195.59,2143.55 -256,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.1801,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,172.21,1631.02 -256,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.3026,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,195.56,1633.94 +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9558,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0234,0.15,149.99 +gfx950,256,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.9466,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0195,0.3,151.48 +gfx950,256,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9687,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0156,0.59,153.23 +gfx950,256,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9927,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0176,1.18,157.31 +gfx950,256,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,5.031,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0171,2.34,165.68 +gfx950,256,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.6354,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0203,5.09,200.59 +gfx950,256,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,5.2547,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0212,6.73,195.26 +gfx950,256,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,13,5.3561,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.021,8.81,209.54 +gfx950,256,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,13,5.6419,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0218,10.45,215.98 +gfx950,256,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,9,5.7166,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,12.38,230.0 +gfx950,256,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.9183,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0165,13.95,238.43 +gfx950,256,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.97,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0172,15.81,252.48 +gfx950,256,256,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,7.0187,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0126,26.89,324.47 +gfx950,256,1,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.6772,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0105,1.52,1524.87 +gfx950,256,2,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.8371,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0107,3.0,1501.19 +gfx950,256,4,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.8551,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0118,5.98,1500.66 +gfx950,256,8,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.9035,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0101,11.91,1497.72 +gfx950,256,16,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.0897,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,23.38,1478.7 +gfx950,256,32,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.2307,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,46.12,1475.34 +gfx950,256,48,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.7334,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0073,65.94,1422.46 +gfx950,256,64,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.6753,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0074,88.4,1446.51 +gfx950,256,128,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,11.6504,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0074,162.01,1385.21 +gfx950,256,256,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,15.3742,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0041,245.53,1140.28 +gfx950,256,1,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.0917,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0083,1.3,1298.58 +gfx950,256,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.0871,auto,0.0,2.34,2340.31 +gfx950,256,2,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.2607,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0056,2.55,1275.95 +gfx950,256,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.6663,auto,0.0,4.88,2443.63 +gfx950,256,4,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.1797,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,5.14,1289.36 +gfx950,256,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.1637,auto,0.0,9.29,2326.79 +gfx950,256,8,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.2315,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0092,10.22,1286.39 +gfx950,256,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.4653,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0132,16.46,2067.51 +gfx950,256,16,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.3253,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0062,20.24,1281.91 +gfx950,256,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.591,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0136,32.57,2054.71 +gfx950,256,32,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.2671,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0065,40.73,1306.98 +gfx950,256,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.7821,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0137,64.08,2040.33 +gfx950,256,48,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,9.8145,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.009,57.69,1250.15 +gfx950,256,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,12.1519,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0136,93.19,1996.61 +gfx950,256,64,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,4,10.1075,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.009,74.69,1229.51 +gfx950,256,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,12.6689,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0137,119.19,1932.76 +gfx950,256,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,13.1447,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,143.59,1879.78 +gfx950,256,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,13.5787,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0136,166.8,1836.14 +gfx950,256,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.4374,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,151.54,1442.62 +gfx950,256,128,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,11.3635,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0035,132.88,1149.12 +gfx950,256,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.3233,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,174.33,1465.01 +gfx950,256,256,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.5453,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,194.26,921.15 +gfx950,256,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.5513,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,2.55,2554.45 +gfx950,256,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.4061,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,5.17,2588.37 +gfx950,256,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.0536,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,9.79,2451.98 +gfx950,256,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.96,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,19.73,2476.52 +gfx950,256,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2044,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,38.66,2437.42 +gfx950,256,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.3457,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,76.44,2430.26 +gfx950,256,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,13.0203,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0076,108.72,2324.0 +gfx950,256,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.4435,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0077,140.4,2269.89 +gfx950,256,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,14.2374,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0076,165.71,2161.29 +gfx950,256,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,14.4747,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0076,195.59,2143.55 +gfx950,256,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.1801,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,172.21,1631.02 +gfx950,256,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.3026,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,195.56,1633.94 diff --git a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv index dbf7cda83d..4f593b38c9 100644 --- a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv @@ -1,126 +1,126 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,8,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8287,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0316,5.63,718.62 -256,16,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9866,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0308,11.03,719.54 -256,24,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0995,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0318,16.31,724.43 -256,32,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1003,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,21.75,739.28 -256,40,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1775,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0303,26.93,747.07 -256,48,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.1109,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,29.0,683.8 -256,56,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,14,9.3551,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0279,32.95,678.86 -256,64,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.8653,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0312,39.74,730.0 -256,72,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.603,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,41.27,686.51 -256,80,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.2927,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,47.39,722.43 -256,88,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,9.9299,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0281,48.79,688.24 -256,96,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,10.5219,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0291,50.23,661.0 -256,104,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.7243,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,53.39,659.8 -256,112,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.0639,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,55.73,650.46 -256,120,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.5619,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,62.55,692.82 -256,128,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.0315,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,58.57,618.24 -256,136,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.9587,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,62.61,632.11 -256,144,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,9,12.4755,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0224,63.54,615.61 -256,168,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.5098,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0228,73.93,642.9 -256,176,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.823,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,75.56,636.62 -256,184,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.9575,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,78.17,639.33 -256,192,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.1135,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,80.6,640.94 -256,200,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.004,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,84.67,655.63 -256,208,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,12.9955,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,88.11,665.36 -256,216,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.5066,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,88.04,649.13 -256,224,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6487,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,90.35,651.22 -256,232,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4444,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,95.0,670.1 -256,240,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.5075,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,97.81,675.92 -256,248,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6196,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0227,100.24,679.23 -256,256,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.8836,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0185,101.51,675.02 -256,264,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0505,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0182,103.44,675.6 -256,272,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.142,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0185,105.88,679.77 -256,280,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0565,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0184,109.66,692.5 -256,288,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.17,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.019,111.89,695.48 -256,296,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0038,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0186,116.36,712.36 -256,304,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.3644,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,116.51,702.89 -256,312,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2119,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0189,120.85,718.94 -256,320,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0163,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,125.68,737.59 -256,328,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.1727,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0188,127.4,737.98 -256,336,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.2772,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,129.56,741.04 -256,344,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3755,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0184,131.73,744.38 -256,352,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.5901,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,132.81,741.71 -256,360,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.4607,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0187,137.05,756.7 -256,368,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.5298,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0186,139.43,761.42 -256,376,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5702,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,142.06,767.6 -256,384,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6341,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0188,144.45,772.51 -256,392,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7176,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,146.63,776.34 -256,400,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7254,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,149.54,784.13 -256,408,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8111,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,151.65,787.75 -256,416,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8643,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,154.07,793.06 -256,424,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0387,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,155.21,791.9 -256,432,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9946,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,158.6,802.29 -256,440,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9368,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,162.16,813.48 -256,448,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0012,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,164.4,818.04 -256,456,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.275,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,164.34,811.29 -256,464,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3249,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0164,166.68,816.53 -256,472,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4675,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,167.99,816.82 -256,480,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4382,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,171.16,826.19 -256,488,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.4672,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0164,173.69,832.46 -256,496,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.5009,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,176.15,838.44 -256,504,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.5875,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,178.0,841.54 -256,512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5896,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.016,180.8,849.17 -256,1,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.4015,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0322,1.75,1749.26 -256,2,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9899,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0269,3.67,1841.43 -256,8,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.6843,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0272,13.52,1705.51 -256,16,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.3784,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,28.03,1783.42 -256,24,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.5427,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,36.92,1579.56 -256,40,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.9028,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0215,59.3,1548.59 -256,48,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.8668,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0211,64.84,1423.28 -256,56,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.662,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,77.1,1462.91 -256,64,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,177,8,12.0685,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0211,77.85,1303.28 -256,72,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.1302,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,80.5,1207.88 -256,80,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4925,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,87.04,1185.16 -256,88,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6987,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,94.3,1176.89 -256,96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,14.0943,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0158,99.99,1153.16 -256,104,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2129,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,107.42,1152.76 -256,112,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,14.2435,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0194,115.43,1159.48 -256,120,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3959,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0181,122.37,1156.31 -256,128,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5892,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,128.8,1149.98 -256,136,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.714,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,135.69,1149.13 -256,144,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.569,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,145.1,1169.56 -256,152,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,14.7479,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0206,151.3,1164.26 -256,160,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,14.9606,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0184,157.0,1156.47 -256,168,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9201,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,165.3,1168.4 -256,176,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.243,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,169.5,1152.24 -256,184,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2612,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,176.99,1159.46 -256,192,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3106,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,184.09,1164.28 -256,200,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.415,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,190.46,1164.9 -256,208,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5644,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,196.18,1162.14 -256,216,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.6173,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,203.04,1166.59 -256,224,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9662,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,205.96,1149.31 -256,232,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.8927,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,214.3,1162.87 -256,240,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9019,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,221.56,1170.44 -256,248,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.1733,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,225.1,1158.9 -256,256,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.642,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,225.82,1134.14 -256,264,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.4312,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,235.86,1156.67 -256,272,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.4581,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0161,242.61,1162.74 -256,280,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.5508,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,248.35,1164.15 -256,288,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.6374,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,254.12,1165.97 -256,296,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.3769,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,250.06,1123.89 -256,304,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.5554,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,254.21,1119.93 -256,312,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.6566,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,259.4,1120.93 -256,320,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.8904,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,262.58,1113.61 -256,328,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.1946,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,264.64,1102.2 -256,336,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2072,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,270.91,1108.63 -256,344,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2544,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,276.64,1112.95 -256,376,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8057,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,293.51,1108.2 -256,384,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8317,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,299.34,1113.63 -256,4,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,8.82,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0279,13.73,3441.26 -256,8,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.9139,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0185,22.19,2787.83 -256,16,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.4057,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,46.56,2938.25 -256,24,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.8415,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,56.59,2392.48 -256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.4509,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,72.03,2295.13 -256,40,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,14.577,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,83.08,2128.01 -256,48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.7667,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0181,98.42,2110.73 -256,56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.4235,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0181,109.93,2030.47 -256,72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.6998,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0177,138.85,2013.65 -256,88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,16.2598,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.02,163.87,1962.56 -256,360,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,7.4475,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,202.75,1008.67 -256,1,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.3991,auto,0.0,1.36,1362.34 -256,48,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,6.2363,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,56.5,1295.21 -256,72,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.6171,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,79.87,1276.38 -256,80,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.5744,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,89.32,1303.36 -256,120,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,6.9203,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,127.28,1327.0 -256,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.3033,auto,0.0,149.05,1476.39 +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,8,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8287,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0316,5.63,718.62 +gfx950,256,16,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9866,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0308,11.03,719.54 +gfx950,256,24,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0995,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0318,16.31,724.43 +gfx950,256,32,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1003,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,21.75,739.28 +gfx950,256,40,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1775,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0303,26.93,747.07 +gfx950,256,48,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.1109,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,29.0,683.8 +gfx950,256,56,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,14,9.3551,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0279,32.95,678.86 +gfx950,256,64,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.8653,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0312,39.74,730.0 +gfx950,256,72,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.603,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,41.27,686.51 +gfx950,256,80,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.2927,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,47.39,722.43 +gfx950,256,88,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,9.9299,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0281,48.79,688.24 +gfx950,256,96,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,10.5219,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0291,50.23,661.0 +gfx950,256,104,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.7243,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,53.39,659.8 +gfx950,256,112,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.0639,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,55.73,650.46 +gfx950,256,120,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.5619,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,62.55,692.82 +gfx950,256,128,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.0315,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,58.57,618.24 +gfx950,256,136,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.9587,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,62.61,632.11 +gfx950,256,144,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,9,12.4755,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0224,63.54,615.61 +gfx950,256,168,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.5098,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0228,73.93,642.9 +gfx950,256,176,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.823,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,75.56,636.62 +gfx950,256,184,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.9575,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,78.17,639.33 +gfx950,256,192,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.1135,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,80.6,640.94 +gfx950,256,200,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.004,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,84.67,655.63 +gfx950,256,208,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,12.9955,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,88.11,665.36 +gfx950,256,216,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.5066,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,88.04,649.13 +gfx950,256,224,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6487,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,90.35,651.22 +gfx950,256,232,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4444,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,95.0,670.1 +gfx950,256,240,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.5075,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,97.81,675.92 +gfx950,256,248,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6196,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0227,100.24,679.23 +gfx950,256,256,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.8836,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0185,101.51,675.02 +gfx950,256,264,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0505,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0182,103.44,675.6 +gfx950,256,272,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.142,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0185,105.88,679.77 +gfx950,256,280,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0565,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0184,109.66,692.5 +gfx950,256,288,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.17,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.019,111.89,695.48 +gfx950,256,296,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0038,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0186,116.36,712.36 +gfx950,256,304,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.3644,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,116.51,702.89 +gfx950,256,312,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2119,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0189,120.85,718.94 +gfx950,256,320,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0163,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,125.68,737.59 +gfx950,256,328,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.1727,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0188,127.4,737.98 +gfx950,256,336,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.2772,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,129.56,741.04 +gfx950,256,344,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3755,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0184,131.73,744.38 +gfx950,256,352,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.5901,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,132.81,741.71 +gfx950,256,360,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.4607,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0187,137.05,756.7 +gfx950,256,368,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.5298,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0186,139.43,761.42 +gfx950,256,376,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5702,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,142.06,767.6 +gfx950,256,384,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6341,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0188,144.45,772.51 +gfx950,256,392,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7176,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,146.63,776.34 +gfx950,256,400,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7254,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,149.54,784.13 +gfx950,256,408,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8111,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,151.65,787.75 +gfx950,256,416,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8643,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,154.07,793.06 +gfx950,256,424,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0387,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,155.21,791.9 +gfx950,256,432,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9946,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,158.6,802.29 +gfx950,256,440,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9368,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,162.16,813.48 +gfx950,256,448,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0012,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,164.4,818.04 +gfx950,256,456,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.275,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,164.34,811.29 +gfx950,256,464,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3249,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0164,166.68,816.53 +gfx950,256,472,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4675,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,167.99,816.82 +gfx950,256,480,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4382,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,171.16,826.19 +gfx950,256,488,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.4672,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0164,173.69,832.46 +gfx950,256,496,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.5009,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,176.15,838.44 +gfx950,256,504,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.5875,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,178.0,841.54 +gfx950,256,512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5896,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.016,180.8,849.17 +gfx950,256,1,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.4015,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0322,1.75,1749.26 +gfx950,256,2,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9899,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0269,3.67,1841.43 +gfx950,256,8,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.6843,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0272,13.52,1705.51 +gfx950,256,16,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.3784,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,28.03,1783.42 +gfx950,256,24,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.5427,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,36.92,1579.56 +gfx950,256,40,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.9028,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0215,59.3,1548.59 +gfx950,256,48,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.8668,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0211,64.84,1423.28 +gfx950,256,56,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.662,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,77.1,1462.91 +gfx950,256,64,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,177,8,12.0685,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0211,77.85,1303.28 +gfx950,256,72,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.1302,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,80.5,1207.88 +gfx950,256,80,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4925,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,87.04,1185.16 +gfx950,256,88,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6987,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,94.3,1176.89 +gfx950,256,96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,14.0943,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0158,99.99,1153.16 +gfx950,256,104,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2129,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,107.42,1152.76 +gfx950,256,112,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,14.2435,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0194,115.43,1159.48 +gfx950,256,120,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3959,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0181,122.37,1156.31 +gfx950,256,128,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5892,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,128.8,1149.98 +gfx950,256,136,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.714,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,135.69,1149.13 +gfx950,256,144,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.569,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,145.1,1169.56 +gfx950,256,152,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,14.7479,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0206,151.3,1164.26 +gfx950,256,160,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,14.9606,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0184,157.0,1156.47 +gfx950,256,168,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9201,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,165.3,1168.4 +gfx950,256,176,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.243,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,169.5,1152.24 +gfx950,256,184,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2612,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,176.99,1159.46 +gfx950,256,192,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3106,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,184.09,1164.28 +gfx950,256,200,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.415,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,190.46,1164.9 +gfx950,256,208,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5644,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,196.18,1162.14 +gfx950,256,216,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.6173,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,203.04,1166.59 +gfx950,256,224,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9662,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,205.96,1149.31 +gfx950,256,232,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.8927,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,214.3,1162.87 +gfx950,256,240,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9019,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,221.56,1170.44 +gfx950,256,248,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.1733,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,225.1,1158.9 +gfx950,256,256,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.642,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,225.82,1134.14 +gfx950,256,264,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.4312,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,235.86,1156.67 +gfx950,256,272,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.4581,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0161,242.61,1162.74 +gfx950,256,280,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.5508,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,248.35,1164.15 +gfx950,256,288,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.6374,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,254.12,1165.97 +gfx950,256,296,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.3769,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,250.06,1123.89 +gfx950,256,304,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.5554,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,254.21,1119.93 +gfx950,256,312,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.6566,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,259.4,1120.93 +gfx950,256,320,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.8904,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,262.58,1113.61 +gfx950,256,328,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.1946,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,264.64,1102.2 +gfx950,256,336,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2072,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,270.91,1108.63 +gfx950,256,344,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2544,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,276.64,1112.95 +gfx950,256,376,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8057,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,293.51,1108.2 +gfx950,256,384,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8317,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,299.34,1113.63 +gfx950,256,4,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,8.82,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0279,13.73,3441.26 +gfx950,256,8,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.9139,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0185,22.19,2787.83 +gfx950,256,16,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.4057,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,46.56,2938.25 +gfx950,256,24,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.8415,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,56.59,2392.48 +gfx950,256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.4509,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,72.03,2295.13 +gfx950,256,40,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,14.577,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,83.08,2128.01 +gfx950,256,48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.7667,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0181,98.42,2110.73 +gfx950,256,56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.4235,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0181,109.93,2030.47 +gfx950,256,72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.6998,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0177,138.85,2013.65 +gfx950,256,88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,16.2598,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.02,163.87,1962.56 +gfx950,256,360,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,7.4475,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,202.75,1008.67 +gfx950,256,1,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.3991,auto,0.0,1.36,1362.34 +gfx950,256,48,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,6.2363,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,56.5,1295.21 +gfx950,256,72,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.6171,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,79.87,1276.38 +gfx950,256,80,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.5744,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,89.32,1303.36 +gfx950,256,120,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,6.9203,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,127.28,1327.0 +gfx950,256,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.3033,auto,0.0,149.05,1476.39 diff --git a/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv index f1b1e1eb26..b55672db6c 100644 --- a/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv @@ -1,157 +1,157 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,15.8275,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0321,4.77,4772.38 -256,16,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,17.7837,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0354,67.93,4278.95 -256,32,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,21.104,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0337,114.48,3634.07 -256,64,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,23.9613,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0226,201.65,3250.64 -256,128,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,35.5248,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.013,272.03,2259.87 -256,256,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,371,4,55.1808,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0161,350.26,1541.58 -256,512,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,98.3036,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,393.22,962.67 -256,1024,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,145.2088,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,532.4,783.5 -256,2048,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,238.9957,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,646.95,636.18 -256,4096,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,327.4554,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,944.37,698.08 -256,8192,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,677.7405,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,912.55,563.17 -256,16384,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1109.5955,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1114.78,619.92 -256,32768,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1981.8193,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1248.3,656.08 -256,1,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,29.5571,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0354,5.11,5110.0 -256,16,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,33.1405,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0335,72.9,4576.48 -256,32,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,38.2897,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0158,126.19,3978.57 -256,64,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,42.02,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0131,229.98,3657.35 -256,128,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,84,8,66.3889,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0244,291.12,2355.35 -256,256,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,331,4,100.3019,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0161,385.38,1612.56 -256,512,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,145.7193,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,530.54,1183.72 -256,1024,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,251.3842,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,615.07,771.67 -256,2048,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,298.7333,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1035.16,793.28 -256,4096,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,598.5374,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1033.31,539.58 -256,8192,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,953.3962,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1297.42,519.12 -256,16384,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1739.3206,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1422.34,482.29 -256,32768,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3408.2792,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1451.7,447.95 -256,1,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,195,16,65.2799,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.031,4.63,4626.86 -256,16,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,68.6305,auto,0.0,70.4,4412.16 -256,32,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,73.2489,auto,0.0,131.93,4145.16 -256,64,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,241,4,84.768,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0159,228.0,3601.2 -256,128,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,110.2749,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0159,350.53,2797.95 -256,256,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,153.5055,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,503.63,2052.68 -256,512,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,262.2584,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,589.57,1251.45 -256,1024,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,286.2339,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1080.37,1238.21 -256,2048,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,597.7434,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1034.68,680.64 -256,4096,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,944.2369,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1310.0,541.92 -256,8192,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1710.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1446.2,421.73 -256,16384,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3321.8968,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1489.45,343.43 -256,32768,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,6749.5084,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1466.12,293.31 -256,1,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,195,16,82.8507,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0333,5.26,5265.7 -256,16,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,78.0998,auto,0.0,89.36,5597.43 -256,32,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,81.5625,auto,0.0,171.14,5371.44 -256,64,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,221,2,109.5182,flydsl_gemm2_abf16_wbf16_bf16_t64x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0074,254.91,4017.68 -256,128,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,135.6846,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0075,411.5,3270.89 -256,256,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,203.5247,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,548.68,2217.97 -256,512,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,281.769,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,792.63,1656.02 -256,1024,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,335.4867,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1331.43,1481.5 -256,2048,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,643.543,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.18,866.83 -256,4096,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1298.0192,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1376.49,523.47 -256,8192,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2452.1446,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1457.26,376.3 -256,16384,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4823.026,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1481.81,292.2 -256,32768,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9990.3785,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.74,238.47 -256,1,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,89,2,14.8968,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0028,4.5,4507.39 -256,1,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,25.8661,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0123,5.19,5190.53 -256,1,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,38.7993,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.014,5.62,5622.52 -256,1,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.4794,auto,0.0,5.32,5318.7 -256,1,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,73.9631,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0237,5.9,5898.44 -256,1,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,131,4,144.3838,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.019,6.04,6042.93 -256,16,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.4467,auto,0.0,74.32,4686.1 -256,16,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.7443,auto,0.0,90.44,5680.23 -256,16,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,37.3119,auto,0.0,93.53,5865.18 -256,16,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,44.7779,auto,0.0,95.92,6012.38 -256,16,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,70.9555,auto,0.0,98.36,6161.01 -256,16,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,146.0885,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,95.55,5981.25 -256,32,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.6697,auto,0.0,146.39,4655.07 -256,32,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.9617,auto,0.0,172.06,5429.46 -256,32,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.4472,auto,0.0,176.93,5566.39 -256,32,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,47.1387,auto,0.0,182.23,5727.95 -256,32,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.4473,auto,0.0,185.01,5806.81 -256,32,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,152.3247,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0173,183.27,5745.41 -256,64,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,18.9896,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,226.17,3658.22 -256,64,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,33.4354,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,256.91,4092.64 -256,64,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,50.5166,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,276.32,4375.85 -256,64,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,66.9741,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,256.52,4055.02 -256,64,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,94.2083,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,296.34,4670.59 -256,64,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,241,4,172.4583,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,323.76,5090.62 -256,128,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.3261,auto,0.0,368.25,3079.27 -256,128,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,42.8585,auto,0.0,400.85,3253.98 -256,128,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,69.0302,auto,0.0,404.42,3244.99 -256,128,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,81.6344,auto,0.0,420.9,3365.33 -256,128,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,123.6218,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0071,451.66,3590.06 -256,128,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,218.6242,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0082,510.78,4040.84 -256,256,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,32.8485,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,523.0,2330.28 -256,256,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,61.6719,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,557.14,2346.34 -256,256,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,94.7682,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,589.17,2425.92 -256,256,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,114.8498,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,598.34,2446.83 -256,256,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,179.7454,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,621.26,2511.4 -256,256,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,349.3089,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,639.37,2560.59 -256,512,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,47.2723,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,726.85,1818.89 -256,512,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,83.3091,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,824.87,1862.81 -256,512,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,127.3264,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,877.03,1898.25 -256,512,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,155.7206,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,882.6,1885.44 -256,512,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,235.6216,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,947.87,1980.36 -256,512,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,459.4132,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,972.28,1994.84 -256,1024,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,60.4762,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1136.31,1733.87 -256,1024,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,107.397,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1279.73,1640.28 -256,1024,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,164.2949,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1359.37,1614.72 -256,1024,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,197.3416,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1392.9,1615.31 -256,1024,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.555,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1429.11,1590.2 -256,1024,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,596.8639,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1496.75,1609.24 -256,2048,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,115.295,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1192.06,1236.88 -256,2048,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,201.0475,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1367.23,1084.84 -256,2048,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.2677,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.43,1000.67 -256,2048,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,370.4202,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1484.14,996.43 -256,2048,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,588.2507,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1518.66,948.31 -256,2048,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1158.5257,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1542.22,905.1 -256,4096,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,209.7933,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1310.23,1039.61 -256,4096,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,384.44,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.02,785.53 -256,4096,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,594.3976,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1502.96,684.47 -256,4096,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,715.0195,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1537.74,656.99 -256,4096,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1180.0684,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1514.07,575.79 -256,4096,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2315.9959,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1542.93,528.82 -256,8192,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,402.4127,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1366.15,917.21 -256,8192,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,735.8522,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1494.2,638.39 -256,8192,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1172.104,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1524.36,508.14 -256,8192,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1418.3458,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1550.41,473.15 -256,8192,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2350.1553,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1520.5,392.63 -256,8192,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4679.1879,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.36,337.04 -256,16384,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,791.7086,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.78,847.65 -256,16384,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1468.7844,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1497.17,548.28 -256,16384,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2338.6747,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.96,416.08 -256,16384,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2840.907,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1548.11,377.96 -256,16384,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4751.9617,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1503.97,296.57 -256,16384,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9811.8509,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1456.77,232.55 -256,32768,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1567.4338,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1402.94,813.48 -256,32768,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2904.4945,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1514.22,508.31 -256,32768,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4677.1636,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1528.03,369.47 -256,32768,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5762.8133,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1526.35,326.06 -256,32768,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9939.0603,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1438.13,239.7 -256,32768,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20923.7314,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1366.26,176.4 -256,1,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,137,16,161.3276,flydsl_gemm2_abf16_wbf16_bf16_t32x256x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0338,5.41,5408.26 -256,16,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,150.2708,auto,0.0,92.89,5814.78 -256,32,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,159.5509,auto,0.0,174.97,5485.19 -256,64,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,253,2,184.9658,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k2_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0074,301.86,4746.39 -256,128,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,224.7686,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,496.82,3930.38 -256,256,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,314.5993,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,709.91,2843.09 -256,512,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,354.3556,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1260.53,2586.26 -256,1024,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,656.6545,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1360.46,1462.71 -256,2048,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1299.8911,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1374.5,806.66 -256,4096,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2451.9553,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1457.37,499.49 -256,8192,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4861.9861,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1469.94,324.37 -256,16384,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9935.8309,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1438.6,229.64 -256,32768,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20992.3261,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1361.8,175.83 -256,1,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,300.7058,auto,0.0,5.8,5802.91 -256,16,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,291.4735,auto,0.0,95.78,5993.89 -256,32,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,310.0882,auto,0.0,180.06,5641.26 -256,64,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,250,1,321.9447,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,346.86,5447.34 -256,128,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,362.9834,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,615.29,4856.02 -256,256,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,404.1413,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1105.25,4405.59 -256,512,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,692.1352,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1290.72,2623.96 -256,1024,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1298.7276,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1375.74,1453.3 -256,2048,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2445.8219,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1461.03,830.0 -256,4096,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4796.6589,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1489.96,482.68 -256,8192,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9742.2812,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1467.18,296.2 -256,16384,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20953.6725,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1364.31,192.16 -256,32768,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,39969.08,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.47,157.83 +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,1,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,15.8275,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0321,4.77,4772.38 +gfx950,256,16,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,17.7837,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0354,67.93,4278.95 +gfx950,256,32,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,21.104,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0337,114.48,3634.07 +gfx950,256,64,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,23.9613,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0226,201.65,3250.64 +gfx950,256,128,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,35.5248,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.013,272.03,2259.87 +gfx950,256,256,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,371,4,55.1808,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0161,350.26,1541.58 +gfx950,256,512,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,98.3036,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,393.22,962.67 +gfx950,256,1024,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,145.2088,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,532.4,783.5 +gfx950,256,2048,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,238.9957,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,646.95,636.18 +gfx950,256,4096,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,327.4554,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,944.37,698.08 +gfx950,256,8192,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,677.7405,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,912.55,563.17 +gfx950,256,16384,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1109.5955,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1114.78,619.92 +gfx950,256,32768,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1981.8193,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1248.3,656.08 +gfx950,256,1,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,29.5571,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0354,5.11,5110.0 +gfx950,256,16,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,33.1405,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0335,72.9,4576.48 +gfx950,256,32,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,38.2897,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0158,126.19,3978.57 +gfx950,256,64,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,42.02,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0131,229.98,3657.35 +gfx950,256,128,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,84,8,66.3889,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0244,291.12,2355.35 +gfx950,256,256,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,331,4,100.3019,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0161,385.38,1612.56 +gfx950,256,512,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,145.7193,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,530.54,1183.72 +gfx950,256,1024,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,251.3842,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,615.07,771.67 +gfx950,256,2048,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,298.7333,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1035.16,793.28 +gfx950,256,4096,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,598.5374,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1033.31,539.58 +gfx950,256,8192,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,953.3962,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1297.42,519.12 +gfx950,256,16384,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1739.3206,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1422.34,482.29 +gfx950,256,32768,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3408.2792,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1451.7,447.95 +gfx950,256,1,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,195,16,65.2799,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.031,4.63,4626.86 +gfx950,256,16,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,68.6305,auto,0.0,70.4,4412.16 +gfx950,256,32,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,73.2489,auto,0.0,131.93,4145.16 +gfx950,256,64,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,241,4,84.768,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0159,228.0,3601.2 +gfx950,256,128,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,110.2749,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0159,350.53,2797.95 +gfx950,256,256,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,153.5055,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,503.63,2052.68 +gfx950,256,512,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,262.2584,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,589.57,1251.45 +gfx950,256,1024,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,286.2339,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1080.37,1238.21 +gfx950,256,2048,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,597.7434,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1034.68,680.64 +gfx950,256,4096,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,944.2369,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1310.0,541.92 +gfx950,256,8192,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1710.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1446.2,421.73 +gfx950,256,16384,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3321.8968,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1489.45,343.43 +gfx950,256,32768,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,6749.5084,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1466.12,293.31 +gfx950,256,1,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,195,16,82.8507,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0333,5.26,5265.7 +gfx950,256,16,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,78.0998,auto,0.0,89.36,5597.43 +gfx950,256,32,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,81.5625,auto,0.0,171.14,5371.44 +gfx950,256,64,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,221,2,109.5182,flydsl_gemm2_abf16_wbf16_bf16_t64x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0074,254.91,4017.68 +gfx950,256,128,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,135.6846,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0075,411.5,3270.89 +gfx950,256,256,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,203.5247,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,548.68,2217.97 +gfx950,256,512,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,281.769,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,792.63,1656.02 +gfx950,256,1024,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,335.4867,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1331.43,1481.5 +gfx950,256,2048,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,643.543,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.18,866.83 +gfx950,256,4096,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1298.0192,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1376.49,523.47 +gfx950,256,8192,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2452.1446,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1457.26,376.3 +gfx950,256,16384,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4823.026,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1481.81,292.2 +gfx950,256,32768,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9990.3785,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.74,238.47 +gfx950,256,1,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,89,2,14.8968,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0028,4.5,4507.39 +gfx950,256,1,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,25.8661,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0123,5.19,5190.53 +gfx950,256,1,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,38.7993,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.014,5.62,5622.52 +gfx950,256,1,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.4794,auto,0.0,5.32,5318.7 +gfx950,256,1,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,73.9631,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0237,5.9,5898.44 +gfx950,256,1,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,131,4,144.3838,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.019,6.04,6042.93 +gfx950,256,16,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.4467,auto,0.0,74.32,4686.1 +gfx950,256,16,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.7443,auto,0.0,90.44,5680.23 +gfx950,256,16,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,37.3119,auto,0.0,93.53,5865.18 +gfx950,256,16,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,44.7779,auto,0.0,95.92,6012.38 +gfx950,256,16,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,70.9555,auto,0.0,98.36,6161.01 +gfx950,256,16,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,146.0885,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,95.55,5981.25 +gfx950,256,32,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.6697,auto,0.0,146.39,4655.07 +gfx950,256,32,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.9617,auto,0.0,172.06,5429.46 +gfx950,256,32,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.4472,auto,0.0,176.93,5566.39 +gfx950,256,32,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,47.1387,auto,0.0,182.23,5727.95 +gfx950,256,32,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.4473,auto,0.0,185.01,5806.81 +gfx950,256,32,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,152.3247,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0173,183.27,5745.41 +gfx950,256,64,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,18.9896,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,226.17,3658.22 +gfx950,256,64,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,33.4354,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,256.91,4092.64 +gfx950,256,64,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,50.5166,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,276.32,4375.85 +gfx950,256,64,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,66.9741,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,256.52,4055.02 +gfx950,256,64,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,94.2083,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,296.34,4670.59 +gfx950,256,64,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,241,4,172.4583,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,323.76,5090.62 +gfx950,256,128,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.3261,auto,0.0,368.25,3079.27 +gfx950,256,128,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,42.8585,auto,0.0,400.85,3253.98 +gfx950,256,128,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,69.0302,auto,0.0,404.42,3244.99 +gfx950,256,128,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,81.6344,auto,0.0,420.9,3365.33 +gfx950,256,128,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,123.6218,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0071,451.66,3590.06 +gfx950,256,128,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,218.6242,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0082,510.78,4040.84 +gfx950,256,256,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,32.8485,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,523.0,2330.28 +gfx950,256,256,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,61.6719,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,557.14,2346.34 +gfx950,256,256,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,94.7682,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,589.17,2425.92 +gfx950,256,256,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,114.8498,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,598.34,2446.83 +gfx950,256,256,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,179.7454,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,621.26,2511.4 +gfx950,256,256,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,349.3089,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,639.37,2560.59 +gfx950,256,512,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,47.2723,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,726.85,1818.89 +gfx950,256,512,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,83.3091,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,824.87,1862.81 +gfx950,256,512,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,127.3264,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,877.03,1898.25 +gfx950,256,512,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,155.7206,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,882.6,1885.44 +gfx950,256,512,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,235.6216,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,947.87,1980.36 +gfx950,256,512,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,459.4132,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,972.28,1994.84 +gfx950,256,1024,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,60.4762,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1136.31,1733.87 +gfx950,256,1024,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,107.397,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1279.73,1640.28 +gfx950,256,1024,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,164.2949,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1359.37,1614.72 +gfx950,256,1024,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,197.3416,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1392.9,1615.31 +gfx950,256,1024,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.555,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1429.11,1590.2 +gfx950,256,1024,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,596.8639,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1496.75,1609.24 +gfx950,256,2048,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,115.295,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1192.06,1236.88 +gfx950,256,2048,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,201.0475,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1367.23,1084.84 +gfx950,256,2048,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.2677,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.43,1000.67 +gfx950,256,2048,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,370.4202,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1484.14,996.43 +gfx950,256,2048,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,588.2507,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1518.66,948.31 +gfx950,256,2048,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1158.5257,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1542.22,905.1 +gfx950,256,4096,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,209.7933,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1310.23,1039.61 +gfx950,256,4096,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,384.44,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.02,785.53 +gfx950,256,4096,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,594.3976,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1502.96,684.47 +gfx950,256,4096,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,715.0195,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1537.74,656.99 +gfx950,256,4096,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1180.0684,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1514.07,575.79 +gfx950,256,4096,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2315.9959,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1542.93,528.82 +gfx950,256,8192,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,402.4127,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1366.15,917.21 +gfx950,256,8192,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,735.8522,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1494.2,638.39 +gfx950,256,8192,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1172.104,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1524.36,508.14 +gfx950,256,8192,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1418.3458,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1550.41,473.15 +gfx950,256,8192,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2350.1553,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1520.5,392.63 +gfx950,256,8192,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4679.1879,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.36,337.04 +gfx950,256,16384,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,791.7086,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.78,847.65 +gfx950,256,16384,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1468.7844,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1497.17,548.28 +gfx950,256,16384,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2338.6747,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.96,416.08 +gfx950,256,16384,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2840.907,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1548.11,377.96 +gfx950,256,16384,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4751.9617,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1503.97,296.57 +gfx950,256,16384,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9811.8509,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1456.77,232.55 +gfx950,256,32768,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1567.4338,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1402.94,813.48 +gfx950,256,32768,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2904.4945,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1514.22,508.31 +gfx950,256,32768,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4677.1636,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1528.03,369.47 +gfx950,256,32768,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5762.8133,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1526.35,326.06 +gfx950,256,32768,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9939.0603,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1438.13,239.7 +gfx950,256,32768,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20923.7314,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1366.26,176.4 +gfx950,256,1,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,137,16,161.3276,flydsl_gemm2_abf16_wbf16_bf16_t32x256x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0338,5.41,5408.26 +gfx950,256,16,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,150.2708,auto,0.0,92.89,5814.78 +gfx950,256,32,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,159.5509,auto,0.0,174.97,5485.19 +gfx950,256,64,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,253,2,184.9658,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k2_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0074,301.86,4746.39 +gfx950,256,128,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,224.7686,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,496.82,3930.38 +gfx950,256,256,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,314.5993,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,709.91,2843.09 +gfx950,256,512,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,354.3556,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1260.53,2586.26 +gfx950,256,1024,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,656.6545,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1360.46,1462.71 +gfx950,256,2048,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1299.8911,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1374.5,806.66 +gfx950,256,4096,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2451.9553,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1457.37,499.49 +gfx950,256,8192,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4861.9861,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1469.94,324.37 +gfx950,256,16384,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9935.8309,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1438.6,229.64 +gfx950,256,32768,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20992.3261,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1361.8,175.83 +gfx950,256,1,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,300.7058,auto,0.0,5.8,5802.91 +gfx950,256,16,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,291.4735,auto,0.0,95.78,5993.89 +gfx950,256,32,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,310.0882,auto,0.0,180.06,5641.26 +gfx950,256,64,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,250,1,321.9447,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,346.86,5447.34 +gfx950,256,128,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,362.9834,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,615.29,4856.02 +gfx950,256,256,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,404.1413,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1105.25,4405.59 +gfx950,256,512,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,692.1352,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1290.72,2623.96 +gfx950,256,1024,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1298.7276,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1375.74,1453.3 +gfx950,256,2048,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2445.8219,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1461.03,830.0 +gfx950,256,4096,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4796.6589,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1489.96,482.68 +gfx950,256,8192,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9742.2812,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1467.18,296.2 +gfx950,256,16384,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20953.6725,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1364.31,192.16 +gfx950,256,32768,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,39969.08,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.47,157.83 diff --git a/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv index 6c50c6bf1e..5fa7cdd523 100644 --- a/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv @@ -1,157 +1,157 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,64,192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,4.3722,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0126,5.76,125.53 -256,1,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,7.365,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0305,2.85,2850.03 -256,16,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,165,16,8.9131,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0292,37.65,2386.89 -256,32,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,16,10.9667,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0306,61.19,1967.57 -256,64,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6294,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,98.48,1627.65 -256,128,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.0329,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0179,178.57,1556.34 -256,256,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8414,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0141,284.94,1370.45 -256,512,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,29.8384,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,359.85,1027.9 -256,1024,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,52.4555,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,409.39,769.61 -256,2048,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,76.0093,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,565.06,786.34 -256,4096,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,75,1,125.2097,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,686.04,787.21 -256,8192,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,170.9303,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1005.08,1030.6 -256,16384,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,319.2235,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1076.35,1037.99 -256,32768,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,539.6049,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1273.51,1189.26 -256,1,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,9.9782,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.034,4.2,4205.62 -256,16,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.7557,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,57.09,3597.16 -256,32,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.7726,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,97.45,3095.36 -256,64,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,16.2682,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0185,165.01,2662.82 -256,128,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,21.6899,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0107,247.52,2060.66 -256,256,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,31.7649,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,338.03,1493.73 -256,512,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,53.2462,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,403.31,994.5 -256,1024,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,78.795,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,545.08,811.77 -256,2048,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,120.1702,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,714.81,715.51 -256,4096,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,163.7211,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1049.34,794.18 -256,8192,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.7178,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1098.75,697.45 -256,16384,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,515.5546,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1332.92,764.74 -256,32768,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,965.4849,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1423.52,773.28 -256,1,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,22.8083,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0232,5.15,5150.37 -256,16,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,25.8218,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,72.77,4567.15 -256,32,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,29.7832,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0139,126.18,3976.19 -256,64,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,34.0572,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0062,220.69,3506.06 -256,128,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.1861,auto,0.0,299.53,2418.45 -256,256,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,72.0475,auto,0.0,417.29,1739.2 -256,512,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,109.5379,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,548.94,1215.74 -256,1024,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,145.6016,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,825.95,1022.64 -256,2048,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,180.6066,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1331.72,998.61 -256,4096,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,343.387,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1400.86,708.44 -256,8192,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,670.3004,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1435.29,550.65 -256,16384,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1263.3316,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1523.07,491.37 -256,32768,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2526.7126,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1523.04,444.88 -256,1,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,6.882,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0099,2.44,2440.52 -256,1,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,8.7573,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0146,3.83,3833.93 -256,1,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,12.7476,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,4.61,4608.22 -256,1,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,22.184,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0198,5.29,5295.31 -256,1,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,26.5479,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,5.06,5056.92 -256,1,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,78.0769,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0173,6.02,6017.6 -256,16,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.3003,auto,0.0,36.77,2338.55 -256,16,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.2334,auto,0.0,47.79,3016.19 -256,16,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,14.7954,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0115,63.5,3994.29 -256,16,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,23.935,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0131,78.51,4927.18 -256,16,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,203,8,28.5135,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,75.31,4725.55 -256,16,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,79.2288,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,94.87,5944.07 -256,32,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.079,auto,0.0,75.84,2453.32 -256,32,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.4,auto,0.0,94.19,3000.86 -256,32,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,17.3213,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0048,108.48,3433.57 -256,32,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,27.5303,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0135,136.51,4301.57 -256,32,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,203,8,33.8365,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,126.93,3997.64 -256,32,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,82.9324,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0168,181.26,5692.85 -256,64,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.4356,auto,0.0,172.69,2803.66 -256,64,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,18.6886,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,201.09,3222.69 -256,64,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,30.8218,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0061,243.86,3874.1 -256,64,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,36.7821,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0062,233.54,3706.01 -256,64,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,97.8785,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,307.16,4847.65 -256,128,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.5917,auto,0.0,223.89,1995.11 -256,128,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,16.0441,auto,0.0,267.7,2254.78 -256,128,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,25.3581,auto,0.0,296.4,2434.52 -256,128,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,46.4994,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,323.28,2610.2 -256,128,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,51.0973,auto,0.0,336.22,2708.79 -256,128,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,9,4,131.5455,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.017,457.1,3642.84 -256,256,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,12.7252,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,337.52,1689.23 -256,256,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9273,auto,0.0,391.75,1769.36 -256,256,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,35.5211,auto,0.0,423.2,1822.85 -256,256,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,67.4176,auto,0.0,445.95,1858.64 -256,256,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.099,auto,0.0,457.53,1898.91 -256,256,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,207.8212,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,578.67,2351.23 -256,512,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,18.1085,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,474.36,1447.63 -256,512,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,31.4793,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,545.75,1399.02 -256,512,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,53.4897,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,562.07,1323.22 -256,512,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,98.4178,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,610.96,1353.1 -256,512,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,122.3437,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,561.69,1234.19 -256,512,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,366.6769,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,655.94,1384.08 -256,1024,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,25.3837,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,676.81,1404.51 -256,1024,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,46.4383,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,739.9,1174.16 -256,1024,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,72.7344,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,826.7,1138.9 -256,1024,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,132.1089,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,910.3,1127.08 -256,1024,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,149.1713,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,921.35,1124.69 -256,1024,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,482.3198,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,997.34,1130.49 -256,2048,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,32.55,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1055.6,1675.14 -256,2048,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,60.4033,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1137.68,1249.89 -256,2048,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,95.4106,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1260.44,1120.99 -256,2048,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,171.7077,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1400.74,1050.36 -256,2048,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,192.9439,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1424.65,1043.45 -256,2048,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,627.237,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1533.83,989.67 -256,4096,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,65.1781,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1054.33,1415.73 -256,4096,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,113.0718,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1215.5,1038.64 -256,4096,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,177.8346,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1352.48,872.66 -256,4096,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,326.7272,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1472.29,744.56 -256,4096,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,366.8778,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1498.47,731.68 -256,4096,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1246.8325,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1543.23,618.97 -256,8192,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,121.9167,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1127.32,1376.12 -256,8192,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,209.0267,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1315.04,963.16 -256,8192,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,337.4416,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1425.54,745.78 -256,8192,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,630.3348,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1526.29,585.56 -256,8192,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,713.5721,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.86,564.28 -256,8192,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2497.3336,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.96,429.96 -256,16384,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,236.8495,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1160.56,1345.86 -256,16384,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,402.8317,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1364.73,916.26 -256,16384,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,657.6327,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1462.93,676.06 -256,16384,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1244.7931,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1545.76,498.68 -256,16384,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1410.2403,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1559.33,475.87 -256,16384,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5002.769,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1538.46,335.36 -256,32768,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,463.0824,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1187.17,1340.49 -256,32768,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,791.1639,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1389.74,890.64 -256,32768,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1300.2138,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1479.87,638.72 -256,32768,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2489.0786,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1546.07,451.6 -256,32768,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2825.562,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1556.52,427.51 -256,32768,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,10090.83,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1525.46,285.97 -256,1,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,32.691,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,5.13,5133.19 -256,16,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,36.3724,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,73.8,4628.84 -256,32,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.5825,auto,0.0,135.63,4268.35 -256,64,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,221,2,53.4577,flydsl_gemm2_abf16_wbf16_bf16_t64x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0062,200.86,3182.54 -256,128,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,69.083,auto,0.0,310.86,2496.86 -256,256,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,84.2787,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,509.61,2102.66 -256,512,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,142.8453,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,601.35,1306.63 -256,1024,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,156.5017,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1097.74,1313.22 -256,2048,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,314.5911,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1092.2,773.29 -256,4096,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,507.5975,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1353.82,627.99 -256,8192,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,973.6448,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1411.59,482.48 -256,16384,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1927.7017,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1425.94,400.35 -256,32768,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3846.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1429.19,357.65 -256,1,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,164,8,45.5191,flydsl_gemm2_abf16_wbf16_bf16_t32x256x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0234,5.16,5161.04 -256,16,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,41.3701,auto,0.0,90.84,5694.98 -256,32,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,43.433,auto,0.0,173.05,5441.09 -256,64,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,59.5058,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,252.62,3995.65 -256,128,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.6763,auto,0.0,387.05,3098.09 -256,256,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,107.7922,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,557.83,2286.02 -256,512,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,147.9651,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,812.75,1743.31 -256,1024,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,183.6583,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1309.6,1530.12 -256,2048,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,344.2741,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1397.25,950.28 -256,4096,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,672.6336,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.31,623.56 -256,8192,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1250.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1538.56,482.95 -256,16384,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2489.7903,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1545.63,390.83 -256,32768,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4995.7199,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.64,342.55 -256,1,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,161.0117,auto,0.0,5.84,5835.94 -256,16,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,155.7742,auto,0.0,96.5,6044.78 -256,32,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,165.8506,auto,0.0,181.28,5690.17 -256,64,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,250,1,174.1356,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,345.3,5443.53 -256,128,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,193.2654,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,622.25,4948.12 -256,256,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,219.6217,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1095.15,4430.7 -256,512,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,381.9333,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1259.48,2635.63 -256,1024,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,693.4583,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1387.35,1548.39 -256,2048,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1276.0101,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1507.94,946.67 -256,4096,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2510.3525,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1532.97,588.12 -256,8192,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5008.9325,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1536.57,401.94 -256,16384,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,10204.9796,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1508.4,302.5 -256,32768,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,21168.2097,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1454.37,247.28 +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,64,192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,4.3722,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0126,5.76,125.53 +gfx950,256,1,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,7.365,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0305,2.85,2850.03 +gfx950,256,16,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,165,16,8.9131,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0292,37.65,2386.89 +gfx950,256,32,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,16,10.9667,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0306,61.19,1967.57 +gfx950,256,64,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6294,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,98.48,1627.65 +gfx950,256,128,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.0329,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0179,178.57,1556.34 +gfx950,256,256,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8414,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0141,284.94,1370.45 +gfx950,256,512,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,29.8384,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,359.85,1027.9 +gfx950,256,1024,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,52.4555,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,409.39,769.61 +gfx950,256,2048,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,76.0093,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,565.06,786.34 +gfx950,256,4096,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,75,1,125.2097,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,686.04,787.21 +gfx950,256,8192,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,170.9303,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1005.08,1030.6 +gfx950,256,16384,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,319.2235,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1076.35,1037.99 +gfx950,256,32768,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,539.6049,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1273.51,1189.26 +gfx950,256,1,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,9.9782,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.034,4.2,4205.62 +gfx950,256,16,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.7557,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,57.09,3597.16 +gfx950,256,32,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.7726,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,97.45,3095.36 +gfx950,256,64,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,16.2682,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0185,165.01,2662.82 +gfx950,256,128,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,21.6899,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0107,247.52,2060.66 +gfx950,256,256,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,31.7649,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,338.03,1493.73 +gfx950,256,512,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,53.2462,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,403.31,994.5 +gfx950,256,1024,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,78.795,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,545.08,811.77 +gfx950,256,2048,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,120.1702,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,714.81,715.51 +gfx950,256,4096,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,163.7211,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1049.34,794.18 +gfx950,256,8192,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.7178,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1098.75,697.45 +gfx950,256,16384,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,515.5546,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1332.92,764.74 +gfx950,256,32768,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,965.4849,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1423.52,773.28 +gfx950,256,1,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,22.8083,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0232,5.15,5150.37 +gfx950,256,16,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,25.8218,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,72.77,4567.15 +gfx950,256,32,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,29.7832,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0139,126.18,3976.19 +gfx950,256,64,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,34.0572,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0062,220.69,3506.06 +gfx950,256,128,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.1861,auto,0.0,299.53,2418.45 +gfx950,256,256,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,72.0475,auto,0.0,417.29,1739.2 +gfx950,256,512,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,109.5379,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,548.94,1215.74 +gfx950,256,1024,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,145.6016,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,825.95,1022.64 +gfx950,256,2048,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,180.6066,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1331.72,998.61 +gfx950,256,4096,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,343.387,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1400.86,708.44 +gfx950,256,8192,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,670.3004,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1435.29,550.65 +gfx950,256,16384,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1263.3316,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1523.07,491.37 +gfx950,256,32768,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2526.7126,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1523.04,444.88 +gfx950,256,1,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,6.882,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0099,2.44,2440.52 +gfx950,256,1,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,8.7573,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0146,3.83,3833.93 +gfx950,256,1,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,12.7476,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,4.61,4608.22 +gfx950,256,1,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,22.184,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0198,5.29,5295.31 +gfx950,256,1,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,26.5479,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,5.06,5056.92 +gfx950,256,1,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,78.0769,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0173,6.02,6017.6 +gfx950,256,16,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.3003,auto,0.0,36.77,2338.55 +gfx950,256,16,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.2334,auto,0.0,47.79,3016.19 +gfx950,256,16,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,14.7954,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0115,63.5,3994.29 +gfx950,256,16,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,23.935,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0131,78.51,4927.18 +gfx950,256,16,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,203,8,28.5135,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,75.31,4725.55 +gfx950,256,16,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,79.2288,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,94.87,5944.07 +gfx950,256,32,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.079,auto,0.0,75.84,2453.32 +gfx950,256,32,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.4,auto,0.0,94.19,3000.86 +gfx950,256,32,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,17.3213,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0048,108.48,3433.57 +gfx950,256,32,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,27.5303,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0135,136.51,4301.57 +gfx950,256,32,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,203,8,33.8365,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,126.93,3997.64 +gfx950,256,32,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,82.9324,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0168,181.26,5692.85 +gfx950,256,64,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.4356,auto,0.0,172.69,2803.66 +gfx950,256,64,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,18.6886,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,201.09,3222.69 +gfx950,256,64,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,30.8218,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0061,243.86,3874.1 +gfx950,256,64,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,36.7821,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0062,233.54,3706.01 +gfx950,256,64,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,97.8785,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,307.16,4847.65 +gfx950,256,128,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.5917,auto,0.0,223.89,1995.11 +gfx950,256,128,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,16.0441,auto,0.0,267.7,2254.78 +gfx950,256,128,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,25.3581,auto,0.0,296.4,2434.52 +gfx950,256,128,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,46.4994,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,323.28,2610.2 +gfx950,256,128,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,51.0973,auto,0.0,336.22,2708.79 +gfx950,256,128,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,9,4,131.5455,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.017,457.1,3642.84 +gfx950,256,256,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,12.7252,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,337.52,1689.23 +gfx950,256,256,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9273,auto,0.0,391.75,1769.36 +gfx950,256,256,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,35.5211,auto,0.0,423.2,1822.85 +gfx950,256,256,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,67.4176,auto,0.0,445.95,1858.64 +gfx950,256,256,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.099,auto,0.0,457.53,1898.91 +gfx950,256,256,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,207.8212,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,578.67,2351.23 +gfx950,256,512,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,18.1085,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,474.36,1447.63 +gfx950,256,512,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,31.4793,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,545.75,1399.02 +gfx950,256,512,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,53.4897,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,562.07,1323.22 +gfx950,256,512,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,98.4178,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,610.96,1353.1 +gfx950,256,512,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,122.3437,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,561.69,1234.19 +gfx950,256,512,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,366.6769,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,655.94,1384.08 +gfx950,256,1024,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,25.3837,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,676.81,1404.51 +gfx950,256,1024,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,46.4383,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,739.9,1174.16 +gfx950,256,1024,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,72.7344,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,826.7,1138.9 +gfx950,256,1024,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,132.1089,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,910.3,1127.08 +gfx950,256,1024,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,149.1713,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,921.35,1124.69 +gfx950,256,1024,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,482.3198,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,997.34,1130.49 +gfx950,256,2048,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,32.55,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1055.6,1675.14 +gfx950,256,2048,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,60.4033,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1137.68,1249.89 +gfx950,256,2048,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,95.4106,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1260.44,1120.99 +gfx950,256,2048,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,171.7077,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1400.74,1050.36 +gfx950,256,2048,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,192.9439,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1424.65,1043.45 +gfx950,256,2048,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,627.237,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1533.83,989.67 +gfx950,256,4096,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,65.1781,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1054.33,1415.73 +gfx950,256,4096,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,113.0718,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1215.5,1038.64 +gfx950,256,4096,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,177.8346,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1352.48,872.66 +gfx950,256,4096,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,326.7272,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1472.29,744.56 +gfx950,256,4096,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,366.8778,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1498.47,731.68 +gfx950,256,4096,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1246.8325,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1543.23,618.97 +gfx950,256,8192,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,121.9167,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1127.32,1376.12 +gfx950,256,8192,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,209.0267,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1315.04,963.16 +gfx950,256,8192,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,337.4416,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1425.54,745.78 +gfx950,256,8192,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,630.3348,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1526.29,585.56 +gfx950,256,8192,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,713.5721,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.86,564.28 +gfx950,256,8192,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2497.3336,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.96,429.96 +gfx950,256,16384,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,236.8495,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1160.56,1345.86 +gfx950,256,16384,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,402.8317,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1364.73,916.26 +gfx950,256,16384,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,657.6327,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1462.93,676.06 +gfx950,256,16384,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1244.7931,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1545.76,498.68 +gfx950,256,16384,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1410.2403,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1559.33,475.87 +gfx950,256,16384,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5002.769,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1538.46,335.36 +gfx950,256,32768,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,463.0824,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1187.17,1340.49 +gfx950,256,32768,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,791.1639,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1389.74,890.64 +gfx950,256,32768,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1300.2138,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1479.87,638.72 +gfx950,256,32768,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2489.0786,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1546.07,451.6 +gfx950,256,32768,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2825.562,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1556.52,427.51 +gfx950,256,32768,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,10090.83,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1525.46,285.97 +gfx950,256,1,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,32.691,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,5.13,5133.19 +gfx950,256,16,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,36.3724,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,73.8,4628.84 +gfx950,256,32,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.5825,auto,0.0,135.63,4268.35 +gfx950,256,64,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,221,2,53.4577,flydsl_gemm2_abf16_wbf16_bf16_t64x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0062,200.86,3182.54 +gfx950,256,128,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,69.083,auto,0.0,310.86,2496.86 +gfx950,256,256,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,84.2787,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,509.61,2102.66 +gfx950,256,512,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,142.8453,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,601.35,1306.63 +gfx950,256,1024,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,156.5017,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1097.74,1313.22 +gfx950,256,2048,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,314.5911,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1092.2,773.29 +gfx950,256,4096,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,507.5975,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1353.82,627.99 +gfx950,256,8192,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,973.6448,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1411.59,482.48 +gfx950,256,16384,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1927.7017,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1425.94,400.35 +gfx950,256,32768,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3846.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1429.19,357.65 +gfx950,256,1,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,164,8,45.5191,flydsl_gemm2_abf16_wbf16_bf16_t32x256x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0234,5.16,5161.04 +gfx950,256,16,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,41.3701,auto,0.0,90.84,5694.98 +gfx950,256,32,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,43.433,auto,0.0,173.05,5441.09 +gfx950,256,64,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,59.5058,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,252.62,3995.65 +gfx950,256,128,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.6763,auto,0.0,387.05,3098.09 +gfx950,256,256,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,107.7922,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,557.83,2286.02 +gfx950,256,512,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,147.9651,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,812.75,1743.31 +gfx950,256,1024,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,183.6583,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1309.6,1530.12 +gfx950,256,2048,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,344.2741,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1397.25,950.28 +gfx950,256,4096,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,672.6336,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.31,623.56 +gfx950,256,8192,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1250.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1538.56,482.95 +gfx950,256,16384,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2489.7903,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1545.63,390.83 +gfx950,256,32768,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4995.7199,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.64,342.55 +gfx950,256,1,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,161.0117,auto,0.0,5.84,5835.94 +gfx950,256,16,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,155.7742,auto,0.0,96.5,6044.78 +gfx950,256,32,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,165.8506,auto,0.0,181.28,5690.17 +gfx950,256,64,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,250,1,174.1356,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,345.3,5443.53 +gfx950,256,128,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,193.2654,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,622.25,4948.12 +gfx950,256,256,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,219.6217,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1095.15,4430.7 +gfx950,256,512,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,381.9333,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1259.48,2635.63 +gfx950,256,1024,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,693.4583,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1387.35,1548.39 +gfx950,256,2048,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1276.0101,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1507.94,946.67 +gfx950,256,4096,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2510.3525,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1532.97,588.12 +gfx950,256,8192,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5008.9325,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1536.57,401.94 +gfx950,256,16384,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,10204.9796,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1508.4,302.5 +gfx950,256,32768,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,21168.2097,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1454.37,247.28 diff --git a/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv index 475fdd74eb..6b6c232083 100644 --- a/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv @@ -1,157 +1,157 @@ -cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.2594,auto,0.0,0.04,42.64 -256,16,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,18.9179,auto,0.0,0.87,62.96 -256,32,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,20.665,auto,0.0,1.59,65.72 -256,64,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9084,auto,0.0,2.99,77.24 -256,128,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,31.8414,auto,0.0,4.12,74.13 -256,256,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,49.0384,auto,0.0,5.35,75.38 -256,512,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.1618,auto,0.0,10.45,126.97 -256,1024,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,132.7156,auto,0.0,7.9,88.27 -256,2048,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,131.8115,auto,0.0,15.91,169.98 -256,4096,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,101.6103,auto,0.0,41.28,430.92 -256,8192,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,101.715,auto,0.0,82.47,850.89 -256,16384,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,103.0716,auto,0.0,162.77,1669.45 -256,32768,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,110.6268,auto,0.0,303.31,3101.62 -256,1,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.9311,auto,0.0,0.08,82.57 -256,16,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,19.4155,auto,0.0,1.69,114.25 -256,32,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.489,auto,0.0,3.05,111.15 -256,64,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,22.9195,auto,0.0,5.72,119.07 -256,128,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,33.6484,auto,0.0,7.79,101.34 -256,256,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,48.3964,auto,0.0,10.83,98.6 -256,512,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,46.1295,auto,0.0,22.73,162.49 -256,1024,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,153.9655,auto,0.0,13.62,84.07 -256,2048,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,152.8232,auto,0.0,27.45,155.99 -256,4096,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,106.0943,auto,0.0,79.07,430.08 -256,8192,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,107.2911,auto,0.0,156.37,831.48 -256,16384,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,109.2356,auto,0.0,307.17,1614.62 -256,32768,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,120.4976,auto,0.0,556.93,2910.43 -256,1,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.9988,auto,0.0,0.34,341.84 -256,16,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,19.3848,auto,0.0,6.76,432.37 -256,32,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.7693,auto,0.0,12.04,393.71 -256,64,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.8795,auto,0.0,21.96,374.79 -256,128,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,33.711,auto,0.0,31.1,287.96 -256,256,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,47.7622,auto,0.0,43.91,234.98 -256,512,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1338,auto,0.0,92.93,315.82 -256,1024,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,169.7397,auto,0.0,49.42,119.69 -256,2048,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,170.1752,auto,0.0,98.59,190.63 -256,4096,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,111.1677,auto,0.0,301.84,509.94 -256,8192,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,117.7072,auto,0.0,570.13,893.62 -256,16384,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,227.3613,auto,0.0,590.33,889.24 -256,32768,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,433.4434,auto,0.0,619.31,914.0 -256,1,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,7.0164,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0012,0.93,935.68 -256,1,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,129,4,7.8429,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.007,1.67,1672.85 -256,1,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.5446,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,2.84,2839.82 -256,1,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,232,16,12.6295,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0311,4.15,4152.92 -256,1,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,17.8122,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,3.68,3680.57 -256,1,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,45.3626,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0342,5.78,5780.21 -256,16,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.9722,auto,0.0,17.56,1128.21 -256,16,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.4685,auto,0.0,28.08,1782.42 -256,16,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.4765,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,42.02,2647.72 -256,16,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6344,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0207,61.53,3869.37 -256,16,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,18.6276,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,56.29,3538.01 -256,16,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,49.8337,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0247,84.17,5280.1 -256,32,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,7.3371,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,28.58,943.46 -256,32,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.159,auto,0.0,51.41,1656.67 -256,32,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.575,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,83.39,2648.15 -256,32,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,16.2461,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,103.27,3267.5 -256,32,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,18.8341,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,111.35,3518.79 -256,32,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,53.5449,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0244,156.66,4932.5 -256,64,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,168,1,7.2458,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,57.89,1006.22 -256,64,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.6051,auto,0.0,97.48,1618.39 -256,64,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.7745,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0079,152.25,2456.2 -256,64,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,17.7938,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0093,188.57,3020.13 -256,64,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,20.3857,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.01,205.75,3287.14 -256,64,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,62.1629,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0134,269.89,4280.3 -256,128,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.8391,auto,0.0,122.66,1173.86 -256,128,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,10.461,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,160.38,1409.58 -256,128,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,20.7104,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,202.52,1685.04 -256,128,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,31.7222,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,211.55,1735.39 -256,128,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,38.2094,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,219.54,1792.36 -256,128,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,84,8,96.9584,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0251,346.07,2784.79 -256,256,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,8.4883,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,197.65,1119.51 -256,256,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,12.1986,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,275.07,1343.1 -256,256,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,23.8446,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,351.8,1552.88 -256,256,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,35.5373,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,377.68,1622.85 -256,256,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,43.5663,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,385.1,1639.67 -256,256,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,153.6203,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0168,436.85,1808.83 -256,512,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,10.3599,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,323.89,1201.93 -256,512,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,16.5475,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,405.55,1188.14 -256,512,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,35.1105,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,477.84,1175.94 -256,512,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,54.7178,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,490.58,1149.8 -256,512,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,67.6879,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,495.72,1142.49 -256,512,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,245.2234,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,547.33,1197.28 -256,1024,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,15.3486,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,437.23,1195.55 -256,1024,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,26.5392,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,505.73,987.76 -256,1024,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,58.5966,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,572.63,850.0 -256,1024,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,89.5148,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,599.76,819.98 -256,1024,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,107.3896,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,624.91,829.96 -256,1024,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,389.4198,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,689.32,834.73 -256,2048,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,22.9328,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,585.27,1314.56 -256,2048,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,35.9027,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,747.67,1095.23 -256,2048,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,70.0071,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,958.6,954.86 -256,2048,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,108.3905,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,990.62,870.67 -256,2048,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,126.7131,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1059.23,889.58 -256,2048,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,456.1972,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1176.84,850.45 -256,4096,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,43.3636,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,619.03,1239.28 -256,4096,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,69.4435,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,773.1,943.73 -256,4096,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,134.616,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,997.04,749.73 -256,4096,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,206.706,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1038.91,659.46 -256,4096,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,244.7736,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1096.67,653.29 -256,4096,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,968.3405,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1108.85,530.6 -256,8192,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,67.9081,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,790.58,1486.21 -256,8192,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,107.7181,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,996.81,1095.13 -256,8192,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,214.2474,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1252.92,789.19 -256,8192,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,326.4603,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1315.62,674.51 -256,8192,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,392.5128,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1367.78,647.83 -256,8192,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1518.2312,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1414.46,504.18 -256,16384,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,114.5126,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,937.66,1705.47 -256,16384,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,181.7006,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1181.88,1226.32 -256,16384,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,385.0646,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1394.24,793.11 -256,16384,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,587.1472,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1463.0,660.78 -256,16384,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,715.1868,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1501.34,619.45 -256,16384,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2808.6562,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1529.19,451.74 -256,32768,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,222.8413,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,963.68,1723.38 -256,32768,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,358.6914,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1197.4,1205.88 -256,32768,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,758.3608,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1415.87,762.21 -256,32768,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1171.8545,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1466.04,617.41 -256,32768,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1431.8652,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1499.78,573.04 -256,32768,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5625.3694,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.0,404.49 -256,1,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,13.997,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0184,4.68,4683.79 -256,16,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.9482,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0121,65.75,4132.42 -256,32,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,18.5129,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0122,113.28,3579.84 -256,64,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,22.5849,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0052,185.71,2967.05 -256,128,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,32.7099,auto,0.0,256.45,2093.71 -256,256,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,46.8385,auto,0.0,358.19,1525.12 -256,512,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,68.9628,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,486.56,1121.37 -256,1024,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,103.7585,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,646.78,859.0 -256,2048,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,124.2504,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1080.22,907.22 -256,4096,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,227.3241,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1180.85,703.44 -256,8192,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,437.9952,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1225.75,580.55 -256,16384,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,799.8221,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1342.48,553.9 -256,32768,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1604.0758,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1338.77,511.52 -256,1,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,25.0392,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0127,5.23,5236.1 -256,16,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,27.453,auto,0.0,76.39,4795.3 -256,32,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,29.9037,auto,0.0,140.26,4421.49 -256,64,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,37.0414,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,226.47,3600.45 -256,128,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,51.7913,auto,0.0,323.94,2619.35 -256,256,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,72.663,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,461.78,1930.1 -256,512,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,103.9297,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,645.71,1437.72 -256,1024,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,115.6173,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1160.88,1451.1 -256,2048,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,218.7234,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1227.28,934.84 -256,4096,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,419.7581,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1279.0,661.98 -256,8192,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,757.9903,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1416.56,560.26 -256,16384,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1489.2465,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1441.99,482.31 -256,32768,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2902.235,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1479.88,449.82 -256,1,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,87.0947,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,6.02,6021.04 -256,16,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,90.7763,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,92.41,5795.46 -256,32,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,94.9692,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,176.66,5558.57 -256,64,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,235,1,107.7019,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,311.55,4934.89 -256,128,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,123.2362,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,544.55,4371.33 -256,256,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,138.7156,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,967.57,3987.47 -256,512,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,229.1527,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1171.43,2539.62 -256,1024,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,442.0268,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1214.57,1447.04 -256,2048,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,795.0722,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1350.5,949.57 -256,4096,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1463.0452,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1467.82,673.71 -256,8192,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2913.1928,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1474.32,496.72 -256,16384,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5818.0865,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1476.42,407.31 -256,32768,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,11574.4717,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1484.29,364.19 +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,1,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.2594,auto,0.0,0.04,42.64 +gfx950,256,16,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,18.9179,auto,0.0,0.87,62.96 +gfx950,256,32,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,20.665,auto,0.0,1.59,65.72 +gfx950,256,64,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9084,auto,0.0,2.99,77.24 +gfx950,256,128,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,31.8414,auto,0.0,4.12,74.13 +gfx950,256,256,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,49.0384,auto,0.0,5.35,75.38 +gfx950,256,512,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.1618,auto,0.0,10.45,126.97 +gfx950,256,1024,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,132.7156,auto,0.0,7.9,88.27 +gfx950,256,2048,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,131.8115,auto,0.0,15.91,169.98 +gfx950,256,4096,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,101.6103,auto,0.0,41.28,430.92 +gfx950,256,8192,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,101.715,auto,0.0,82.47,850.89 +gfx950,256,16384,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,103.0716,auto,0.0,162.77,1669.45 +gfx950,256,32768,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,110.6268,auto,0.0,303.31,3101.62 +gfx950,256,1,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.9311,auto,0.0,0.08,82.57 +gfx950,256,16,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,19.4155,auto,0.0,1.69,114.25 +gfx950,256,32,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.489,auto,0.0,3.05,111.15 +gfx950,256,64,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,22.9195,auto,0.0,5.72,119.07 +gfx950,256,128,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,33.6484,auto,0.0,7.79,101.34 +gfx950,256,256,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,48.3964,auto,0.0,10.83,98.6 +gfx950,256,512,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,46.1295,auto,0.0,22.73,162.49 +gfx950,256,1024,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,153.9655,auto,0.0,13.62,84.07 +gfx950,256,2048,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,152.8232,auto,0.0,27.45,155.99 +gfx950,256,4096,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,106.0943,auto,0.0,79.07,430.08 +gfx950,256,8192,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,107.2911,auto,0.0,156.37,831.48 +gfx950,256,16384,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,109.2356,auto,0.0,307.17,1614.62 +gfx950,256,32768,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,120.4976,auto,0.0,556.93,2910.43 +gfx950,256,1,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.9988,auto,0.0,0.34,341.84 +gfx950,256,16,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,19.3848,auto,0.0,6.76,432.37 +gfx950,256,32,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.7693,auto,0.0,12.04,393.71 +gfx950,256,64,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.8795,auto,0.0,21.96,374.79 +gfx950,256,128,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,33.711,auto,0.0,31.1,287.96 +gfx950,256,256,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,47.7622,auto,0.0,43.91,234.98 +gfx950,256,512,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1338,auto,0.0,92.93,315.82 +gfx950,256,1024,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,169.7397,auto,0.0,49.42,119.69 +gfx950,256,2048,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,170.1752,auto,0.0,98.59,190.63 +gfx950,256,4096,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,111.1677,auto,0.0,301.84,509.94 +gfx950,256,8192,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,117.7072,auto,0.0,570.13,893.62 +gfx950,256,16384,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,227.3613,auto,0.0,590.33,889.24 +gfx950,256,32768,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,433.4434,auto,0.0,619.31,914.0 +gfx950,256,1,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,7.0164,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0012,0.93,935.68 +gfx950,256,1,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,129,4,7.8429,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.007,1.67,1672.85 +gfx950,256,1,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.5446,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,2.84,2839.82 +gfx950,256,1,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,232,16,12.6295,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0311,4.15,4152.92 +gfx950,256,1,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,17.8122,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,3.68,3680.57 +gfx950,256,1,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,45.3626,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0342,5.78,5780.21 +gfx950,256,16,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.9722,auto,0.0,17.56,1128.21 +gfx950,256,16,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.4685,auto,0.0,28.08,1782.42 +gfx950,256,16,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.4765,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,42.02,2647.72 +gfx950,256,16,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6344,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0207,61.53,3869.37 +gfx950,256,16,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,18.6276,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,56.29,3538.01 +gfx950,256,16,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,49.8337,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0247,84.17,5280.1 +gfx950,256,32,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,7.3371,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,28.58,943.46 +gfx950,256,32,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.159,auto,0.0,51.41,1656.67 +gfx950,256,32,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.575,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,83.39,2648.15 +gfx950,256,32,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,16.2461,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,103.27,3267.5 +gfx950,256,32,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,18.8341,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,111.35,3518.79 +gfx950,256,32,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,53.5449,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0244,156.66,4932.5 +gfx950,256,64,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,168,1,7.2458,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,57.89,1006.22 +gfx950,256,64,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.6051,auto,0.0,97.48,1618.39 +gfx950,256,64,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.7745,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0079,152.25,2456.2 +gfx950,256,64,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,17.7938,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0093,188.57,3020.13 +gfx950,256,64,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,20.3857,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.01,205.75,3287.14 +gfx950,256,64,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,62.1629,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0134,269.89,4280.3 +gfx950,256,128,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.8391,auto,0.0,122.66,1173.86 +gfx950,256,128,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,10.461,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,160.38,1409.58 +gfx950,256,128,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,20.7104,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,202.52,1685.04 +gfx950,256,128,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,31.7222,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,211.55,1735.39 +gfx950,256,128,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,38.2094,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,219.54,1792.36 +gfx950,256,128,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,84,8,96.9584,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0251,346.07,2784.79 +gfx950,256,256,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,8.4883,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,197.65,1119.51 +gfx950,256,256,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,12.1986,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,275.07,1343.1 +gfx950,256,256,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,23.8446,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,351.8,1552.88 +gfx950,256,256,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,35.5373,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,377.68,1622.85 +gfx950,256,256,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,43.5663,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,385.1,1639.67 +gfx950,256,256,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,153.6203,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0168,436.85,1808.83 +gfx950,256,512,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,10.3599,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,323.89,1201.93 +gfx950,256,512,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,16.5475,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,405.55,1188.14 +gfx950,256,512,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,35.1105,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,477.84,1175.94 +gfx950,256,512,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,54.7178,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,490.58,1149.8 +gfx950,256,512,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,67.6879,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,495.72,1142.49 +gfx950,256,512,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,245.2234,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,547.33,1197.28 +gfx950,256,1024,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,15.3486,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,437.23,1195.55 +gfx950,256,1024,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,26.5392,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,505.73,987.76 +gfx950,256,1024,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,58.5966,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,572.63,850.0 +gfx950,256,1024,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,89.5148,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,599.76,819.98 +gfx950,256,1024,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,107.3896,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,624.91,829.96 +gfx950,256,1024,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,389.4198,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,689.32,834.73 +gfx950,256,2048,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,22.9328,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,585.27,1314.56 +gfx950,256,2048,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,35.9027,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,747.67,1095.23 +gfx950,256,2048,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,70.0071,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,958.6,954.86 +gfx950,256,2048,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,108.3905,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,990.62,870.67 +gfx950,256,2048,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,126.7131,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1059.23,889.58 +gfx950,256,2048,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,456.1972,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1176.84,850.45 +gfx950,256,4096,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,43.3636,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,619.03,1239.28 +gfx950,256,4096,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,69.4435,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,773.1,943.73 +gfx950,256,4096,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,134.616,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,997.04,749.73 +gfx950,256,4096,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,206.706,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1038.91,659.46 +gfx950,256,4096,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,244.7736,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1096.67,653.29 +gfx950,256,4096,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,968.3405,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1108.85,530.6 +gfx950,256,8192,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,67.9081,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,790.58,1486.21 +gfx950,256,8192,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,107.7181,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,996.81,1095.13 +gfx950,256,8192,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,214.2474,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1252.92,789.19 +gfx950,256,8192,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,326.4603,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1315.62,674.51 +gfx950,256,8192,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,392.5128,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1367.78,647.83 +gfx950,256,8192,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1518.2312,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1414.46,504.18 +gfx950,256,16384,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,114.5126,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,937.66,1705.47 +gfx950,256,16384,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,181.7006,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1181.88,1226.32 +gfx950,256,16384,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,385.0646,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1394.24,793.11 +gfx950,256,16384,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,587.1472,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1463.0,660.78 +gfx950,256,16384,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,715.1868,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1501.34,619.45 +gfx950,256,16384,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2808.6562,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1529.19,451.74 +gfx950,256,32768,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,222.8413,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,963.68,1723.38 +gfx950,256,32768,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,358.6914,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1197.4,1205.88 +gfx950,256,32768,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,758.3608,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1415.87,762.21 +gfx950,256,32768,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1171.8545,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1466.04,617.41 +gfx950,256,32768,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1431.8652,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1499.78,573.04 +gfx950,256,32768,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5625.3694,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.0,404.49 +gfx950,256,1,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,13.997,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0184,4.68,4683.79 +gfx950,256,16,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.9482,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0121,65.75,4132.42 +gfx950,256,32,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,18.5129,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0122,113.28,3579.84 +gfx950,256,64,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,22.5849,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0052,185.71,2967.05 +gfx950,256,128,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,32.7099,auto,0.0,256.45,2093.71 +gfx950,256,256,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,46.8385,auto,0.0,358.19,1525.12 +gfx950,256,512,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,68.9628,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,486.56,1121.37 +gfx950,256,1024,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,103.7585,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,646.78,859.0 +gfx950,256,2048,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,124.2504,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1080.22,907.22 +gfx950,256,4096,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,227.3241,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1180.85,703.44 +gfx950,256,8192,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,437.9952,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1225.75,580.55 +gfx950,256,16384,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,799.8221,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1342.48,553.9 +gfx950,256,32768,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1604.0758,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1338.77,511.52 +gfx950,256,1,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,25.0392,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0127,5.23,5236.1 +gfx950,256,16,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,27.453,auto,0.0,76.39,4795.3 +gfx950,256,32,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,29.9037,auto,0.0,140.26,4421.49 +gfx950,256,64,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,37.0414,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,226.47,3600.45 +gfx950,256,128,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,51.7913,auto,0.0,323.94,2619.35 +gfx950,256,256,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,72.663,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,461.78,1930.1 +gfx950,256,512,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,103.9297,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,645.71,1437.72 +gfx950,256,1024,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,115.6173,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1160.88,1451.1 +gfx950,256,2048,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,218.7234,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1227.28,934.84 +gfx950,256,4096,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,419.7581,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1279.0,661.98 +gfx950,256,8192,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,757.9903,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1416.56,560.26 +gfx950,256,16384,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1489.2465,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1441.99,482.31 +gfx950,256,32768,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2902.235,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1479.88,449.82 +gfx950,256,1,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,87.0947,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,6.02,6021.04 +gfx950,256,16,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,90.7763,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,92.41,5795.46 +gfx950,256,32,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,94.9692,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,176.66,5558.57 +gfx950,256,64,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,235,1,107.7019,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,311.55,4934.89 +gfx950,256,128,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,123.2362,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,544.55,4371.33 +gfx950,256,256,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,138.7156,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,967.57,3987.47 +gfx950,256,512,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,229.1527,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1171.43,2539.62 +gfx950,256,1024,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,442.0268,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1214.57,1447.04 +gfx950,256,2048,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,795.0722,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1350.5,949.57 +gfx950,256,4096,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1463.0452,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1467.82,673.71 +gfx950,256,8192,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2913.1928,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1474.32,496.72 +gfx950,256,16384,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5818.0865,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1476.42,407.31 +gfx950,256,32768,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,11574.4717,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1484.29,364.19 diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 27f1a2966b..6310a6dadd 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -228,6 +228,8 @@ def update_config_files(self, file_path: str, merge_name: str): keys = untunedf.columns.to_list() if "cu_num" not in keys: keys.append("cu_num") + if "gfx" in merge_df.columns and "gfx" not in keys: + keys.append("gfx") dedup_keys = keys + ["_tag"] if has_tag else keys duplicated_mask = merge_df.duplicated(subset=dedup_keys, keep=False) if duplicated_mask.any(): @@ -1594,7 +1596,7 @@ def canonicalize_hint(hint): ) return True - # develop=True: torch.Tensor -> pybind aiter_tensor_t before C++ (activation, CAR, …). + # develop=True: torch.Tensor -> pybind aiter_tensor_t before C++ (activation, CAR, ...). if develop: import torch diff --git a/aiter/jit/utils/build_targets.py b/aiter/jit/utils/build_targets.py new file mode 100644 index 0000000000..86580f1f63 --- /dev/null +++ b/aiter/jit/utils/build_targets.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +# +# Pure-Python arch constants and env-driven build target resolution. +# No torch dependency — safe to import in build scripts, gen_instances, and tests +# that run without a GPU or a full PyTorch install. +import os + +GFX_MAP = { + 0: "native", + 1: "gfx90a", + 2: "gfx908", + 3: "gfx940", + 4: "gfx941", + 5: "gfx942", + 6: "gfx945", + 7: "gfx1100", + 8: "gfx950", + 9: "gfx1101", + 10: "gfx1102", + 11: "gfx1103", + 12: "gfx1150", + 13: "gfx1151", + 14: "gfx1152", + 15: "gfx1153", + 16: "gfx1200", + 17: "gfx1201", + 18: "gfx1250", +} + +# Maps gfx arch to the default (SPX / full-GPU) CU count used when no live GPU is +# present at build time (e.g. CI nodes with GPU_ARCHS set but no device visible). +# For live GPU builds, get_cu_num() is used instead and correctly reflects the +# actual visible CU count, including non-SPX partition modes (DPX / QPX / CPX) +# and binned variants (e.g. MI308X is gfx942 but has fewer CUs than MI300X). +# If building without a GPU for a binned or partitioned target, set CU_NUM +# explicitly alongside GPU_ARCHS to override the default here. +# Extend this table when adding support for new GPU targets. +GFX_CU_NUM_MAP = { + "gfx942": 304, # MI300X (SPX, full GPU); MI308X shares gfx942 — use CU_NUM override + "gfx950": 256, # MI350 +} + + +def _parse_gpu_archs_env(gfx_env: str) -> list[str]: + """Split a GPU_ARCHS string into a list of non-empty architecture names. + + Raises RuntimeError if no valid architecture names remain after splitting + on ';' and stripping whitespace — e.g. GPU_ARCHS=" ; " would otherwise + silently produce an empty target list and fall back to heuristic kernels. + """ + archs = [g.strip() for g in gfx_env.split(";") if g.strip()] + if not archs: + raise RuntimeError( + f"GPU_ARCHS={gfx_env!r} contains no valid architecture names after splitting on ';'. " + f"Known targets: {list(GFX_CU_NUM_MAP.keys())}" + ) + return archs + + +def get_build_targets_env() -> list[tuple[str, int]]: + """Resolve build targets from GPU_ARCHS env var only. No live GPU detection. + + Raises RuntimeError if GPU_ARCHS is not set or contains an unknown arch. + Intended for CI nodes, build scripts, and tests that run without a GPU. + Use chip_info.get_build_targets() when live GPU fallback is also desired. + """ + gfx_env = os.getenv("GPU_ARCHS") + if not gfx_env: + raise RuntimeError( + "GPU_ARCHS is not set. " + "Set GPU_ARCHS=gfx942 (or similar) to resolve build targets without a GPU." + ) + targets = [] + for gfx in _parse_gpu_archs_env(gfx_env): + if gfx not in GFX_CU_NUM_MAP: + raise RuntimeError( + f"Unknown gfx '{gfx}' in GPU_ARCHS — add it to " + f"GFX_CU_NUM_MAP in build_targets.py. Known targets: " + f"{list(GFX_CU_NUM_MAP.keys())}" + ) + cu_num = int(os.getenv("CU_NUM", GFX_CU_NUM_MAP[gfx])) + targets.append((gfx, cu_num)) + return targets + + +def filter_tune_df(tune_df, targets: list): + """Return the subset of tune_df whose (gfx, cu_num) matches any entry in targets. + + Args: + tune_df: pandas DataFrame loaded from a tuning CSV (must have 'gfx' and + 'cu_num' columns). + targets: list of (gfx, cu_num) tuples, as returned by get_build_targets() + or get_build_targets_env(). + + Returns: + Filtered DataFrame (original index preserved, no reset). + """ + import pandas as pd + + mask = pd.Series([False] * len(tune_df), index=tune_df.index) + for gfx, cu_num in targets: + mask |= (tune_df["gfx"] == gfx) & (tune_df["cu_num"] == cu_num) + return tune_df[mask] diff --git a/aiter/jit/utils/chip_info.py b/aiter/jit/utils/chip_info.py index 36699fcead..40cc37f6db 100644 --- a/aiter/jit/utils/chip_info.py +++ b/aiter/jit/utils/chip_info.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import functools +import logging import os import re import subprocess @@ -8,27 +9,14 @@ from cpp_extension import executable_path from torch_guard import torch_compile_guard -GFX_MAP = { - 0: "native", - 1: "gfx90a", - 2: "gfx908", - 3: "gfx940", - 4: "gfx941", - 5: "gfx942", - 6: "gfx945", - 7: "gfx1100", - 8: "gfx950", - 9: "gfx1101", - 10: "gfx1102", - 11: "gfx1103", - 12: "gfx1150", - 13: "gfx1151", - 14: "gfx1152", - 15: "gfx1153", - 16: "gfx1200", - 17: "gfx1201", - 18: "gfx1250", -} +from build_targets import ( # noqa: F401 — re-exported for callers + GFX_MAP, + _parse_gpu_archs_env, + filter_tune_df, + get_build_targets_env, +) + +logger = logging.getLogger("aiter") @functools.lru_cache(maxsize=1) @@ -43,8 +31,9 @@ def _detect_native() -> list[str]: check=True, ) for line in result.stdout.splitlines(): - if "gfx" in line.lower(): - return [line.split(":", 1)[-1].strip()] + match = re.search(r"\b(gfx\w+)\b", line, re.IGNORECASE) + if match: + return [match.group(1).lower()] except Exception as e: raise RuntimeError(f"Get GPU arch from rocminfo failed: {e}") from e raise RuntimeError("No gfx arch found in rocminfo output.") @@ -59,29 +48,12 @@ def get_gfx_custom_op() -> int: def get_gfx_custom_op_core() -> int: gfx = os.getenv("GPU_ARCHS", "native") gfx_mapping = {v: k for k, v in GFX_MAP.items()} - # gfx = os.getenv("GPU_ARCHS", "native") if gfx == "native": - try: - rocminfo = executable_path("rocminfo") - result = subprocess.run( - [rocminfo], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - output = result.stdout - for line in output.split("\n"): - match = re.search(r"\b(gfx\w+)\b", line, re.IGNORECASE) - if match: - gfx_arch = match.group(1).lower() - try: - return gfx_mapping[gfx_arch] - except KeyError: - raise KeyError( - f"Unknown GPU architecture: {gfx_arch}. " - f"Supported architectures: {list(gfx_mapping.keys())}" - ) - - except Exception as e: - raise RuntimeError(f"Get GPU arch from rocminfo failed {str(e)}") + gfx = _detect_native()[0] elif ";" in gfx: + # TODO: multi-arch GPU_ARCHS (e.g. "gfx942;gfx950") — picking the + # last entry is a known limitation for build-time codegen callers. + # For runtime dispatch, prefer get_gfx_runtime(). gfx = gfx.split(";")[-1] try: return gfx_mapping[gfx] @@ -98,6 +70,25 @@ def get_gfx(): return GFX_MAP.get(gfx_num, "unknown") +@functools.lru_cache(maxsize=1) +def get_gfx_runtime() -> str: + """Return the arch of the live GPU, always via rocminfo. + + Unlike get_gfx(), ignores GPU_ARCHS — always detects the actual running + GPU. Use for runtime dispatch decisions (selecting tuned kernels, picking + code paths). Use get_gfx() for build-time codegen paths (gen_instances, + csrc module-level arch selection) where no GPU may be available. + """ + gfx_arch = _detect_native()[0] + supported = set(GFX_MAP.values()) + if gfx_arch not in supported: + raise KeyError( + f"Unknown GPU architecture: {gfx_arch}. " + f"Supported architectures: {sorted(supported)}" + ) + return gfx_arch + + @functools.lru_cache(maxsize=1) def get_gfx_list() -> list[str]: @@ -108,7 +99,7 @@ def get_gfx_list() -> list[str]: except RuntimeError: gfxs = ["cpu"] else: - gfxs = [g.strip() for g in gfx_env.split(";") if g.strip()] + gfxs = _parse_gpu_archs_env(gfx_env) os.environ["AITER_GPU_ARCHS"] = ";".join(gfxs) return gfxs @@ -146,6 +137,200 @@ def get_cu_num(): return cu_num +def get_build_targets() -> list[tuple[str, int]]: + """Return (gfx, cu_num) pairs to compile kernels for. + + Used by gen_instances.py in all CK GEMM modules to filter the tuning CSV + to exactly the right set of kernels for the target GPU(s). + + Priority: + 1. GPU_ARCHS set to an explicit non-empty target list → delegate to + get_build_targets_env() (no GPU needed). + 2. GPU_ARCHS unset, empty/whitespace, or "native" → call get_gfx() + (GPU_ARCHS-aware; falls back to rocminfo when GPU_ARCHS is unset) and + get_cu_num(), which correctly reflect partition mode and binned variants. + 3. Neither → raise RuntimeError with a clear message. + """ + gpu_archs = os.getenv("GPU_ARCHS") + gpu_archs_normalized = gpu_archs.strip() if gpu_archs is not None else "" + if gpu_archs_normalized and gpu_archs_normalized.lower() != "native": + return get_build_targets_env() + + try: + # get_gfx() is intentional here — this is a build-time path; get_gfx_runtime() + # would fail in CI environments without a live GPU. + return [(get_gfx(), get_cu_num())] + except Exception as e: + raise RuntimeError( + "No GPU detected and GPU_ARCHS is not set to an explicit target. " + "Set GPU_ARCHS=gfx942 (or similar) to build without a GPU." + ) from e + + +def build_tune_dict( + tune_df, default_dict, kernels_list, libtype=None, kernels_by_name=None +): + """Filter tune_df to rows matching the current build targets and return a + (gfx, cu_num, M, N, K)-keyed dispatch dict, starting from a copy of default_dict. + + Replaces the duplicated get_tune_dict filtering loop in each gen_instances.py. + Modules keep their own default_dict and kernels_list; only the CSV filtering + and key construction are shared here. + + Args: + tune_df: pandas DataFrame already loaded from the tuning CSV. + default_dict: module-level fallback dict (negative-int keys) to start from. + kernels_list: module-level dict mapping kernelId → kernelInstance. + libtype: Optional string to filter the "libtype" column (e.g. "ck"). + Required for CSVs that mix multiple library types (e.g. + a8w8_bpreshuffle_tuned_gemm.csv mixes "ck" and "cktile"). + If None, no libtype filtering is applied. + kernels_by_name: Optional dict mapping kernelName string → kernelInstance. + When provided and the CSV has a "kernelName" column, kernel + lookup uses the name instead of kernelId. If the name is not + found in kernels_by_name, the entry is skipped (heuristic + default used) and a warning is logged — no kernelId fallback + is attempted, because kernelIds are not stable across kernel + list reorderings. Falls back to kernelId if the kernelName + column is absent from the CSV. + + Returns: + dict with mixed keys: negative ints (from default_dict) and + (gfx, cu_num, M, N, K) 5-tuples (from the filtered CSV rows). + """ + tune_dict = dict(default_dict) + targets = get_build_targets() + filtered = filter_tune_df(tune_df, targets) + if libtype is not None and "libtype" in tune_df.columns: + filtered = filtered[filtered["libtype"] == libtype] + use_name = kernels_by_name is not None and "kernelName" in tune_df.columns + if kernels_by_name is not None and not use_name: + logger.warning( + "kernels_by_name provided but CSV has no kernelName column, falling back to kernelId." + ) + for _, row in filtered.iterrows(): + key = ( + str(row["gfx"]), + int(row["cu_num"]), + int(row["M"]), + int(row["N"]), + int(row["K"]), + ) + if use_name: + kname = str(row["kernelName"]) + kernel = kernels_by_name.get(kname) + if kernel is not None: + tune_dict[key] = kernel + else: + logger.warning( + f"kernelName '{kname}' not found in kernels_by_name " + f"(gfx={key[0]}, cu_num={key[1]}, M={key[2]}, N={key[3]}, K={key[4]}); " + f"falling back to heuristic default." + ) + else: + kid = int(row["kernelId"]) + kernel = kernels_list.get(kid) + if kernel is not None: + tune_dict[key] = kernel + else: + logger.warning( + f"kernelId {kid} not in kernels_list " + f"(gfx={key[0]}, cu_num={key[1]}, M={key[2]}, N={key[3]}, K={key[4]}, " + f"kernels_list size={len(kernels_list)}); falling back to heuristic default." + ) + return tune_dict + + +def build_tune_dict_batched(tune_df, default_dict, kernels_list, libtype=None): + """Like build_tune_dict, but for batched GEMM modules whose dispatch key + includes the batch dimension B. + + Builds a (gfx, cu_num, B, M, N, K) 6-tuple keyed dict suitable for use with + BatchedGemmDispatchMap in the C++ dispatch layer. + + Args: + tune_df: pandas DataFrame loaded from the batched tuning CSV. + default_dict: module-level fallback dict (negative-int keys) to start from. + kernels_list: module-level dict mapping kernelId → kernelInstance. + libtype: Optional string to filter the "libtype" column (same semantics + as build_tune_dict). + + Returns: + dict with mixed keys: negative ints (from default_dict) and + (gfx, cu_num, B, M, N, K) 6-tuples (from the filtered CSV rows). + """ + tune_dict = dict(default_dict) + targets = get_build_targets() + filtered = filter_tune_df(tune_df, targets) + if libtype is not None and "libtype" in tune_df.columns: + filtered = filtered[filtered["libtype"] == libtype] + for _, row in filtered.iterrows(): + key = ( + str(row["gfx"]), + int(row["cu_num"]), + int(row["B"]), + int(row["M"]), + int(row["N"]), + int(row["K"]), + ) + kid = int(row["kernelId"]) + kernel = kernels_list.get(kid) + if kernel is not None: + tune_dict[key] = kernel + else: + logger.warning( + f"kernelId {kid} not in kernels_list " + f"(gfx={key[0]}, cu_num={key[1]}, B={key[2]}, M={key[3]}, N={key[4]}, K={key[5]}, " + f"kernels_list size={len(kernels_list)}); falling back to heuristic default." + ) + return tune_dict + + +def write_lookup_header( + output_path, kernels_dict, lookup_head, lookup_template, lookup_end, istune=False +): + """Write a C++ GEMM dispatch lookup header from a kernels_dict. + + Replaces the duplicated gen_lookup_dict loop in each gen_instances.py codegen + class. Each module still defines its own lookup_head / lookup_template / + lookup_end strings (they embed the module-specific GENERATE_LOOKUP_TABLE macro + type parameters), but the iteration and key-formatting logic is shared here. + + Key layout in kernels_dict: + - Negative ints (default_dict entries) → skipped in non-tune mode. + - (gfx,cu_num,M,N,K) 5-tuples (tuned entries) → written as {"gfx",cu_num,M,N,K} C++ key. + - (gfx,cu_num,B,M,N,K) 6-tuples (batched) → written as {"gfx",cu_num,B,M,N,K} C++ key. + - Non-negative ints (tune mode only) → written as plain integer kernel ID. + + Args: + output_path: Full path of the .h file to write. + kernels_dict: Dict returned by build_tune_dict (or get_tune_dict). + lookup_head: String written before the loop (defines the macro header). + lookup_template: String with {MNK} and {kernel_name} placeholders. + lookup_end: String written after the loop (closes the macro / #endif). + istune: True when generating the tune-mode lookup (int kernelId keys). + """ + with open(output_path, "w") as f: + f.write(lookup_head) + for key, k in kernels_dict.items(): + if not istune and (isinstance(key, tuple) and isinstance(key[0], str)): + # 5-tuple key: (gfx, cu_num, M, N, K) + # 6-tuple key: (gfx, cu_num, B, M, N, K) + # key[0] is the gfx arch string; the remaining elements are ints. + cpp_key = ( + '{"' + key[0] + '", ' + ", ".join(str(x) for x in key[1:]) + "}" + ) + f.write( + lookup_template.format( + MNK=cpp_key, + kernel_name=k.name, + ) + ) + elif istune and isinstance(key, int) and key >= 0: + f.write(lookup_template.format(MNK=key, kernel_name=k.name)) + f.write(lookup_end) + + def _get_pci_chip_id(device_id=0): import ctypes diff --git a/aiter/ops/batched_gemm_op_a8w8.py b/aiter/ops/batched_gemm_op_a8w8.py index cbab06e679..9874fc99ba 100644 --- a/aiter/ops/batched_gemm_op_a8w8.py +++ b/aiter/ops/batched_gemm_op_a8w8.py @@ -8,12 +8,11 @@ import pandas as pd from ..jit.core import ( compile_ops, - AITER_ROOT_DIR, AITER_CONFIGS, AITER_LOG_TUNED_CONFIG, ) from ..utility import dtypes -from ..jit.utils.chip_info import get_cu_num +from ..jit.utils.chip_info import get_cu_num, get_gfx_runtime as get_gfx from aiter import logger @@ -73,14 +72,34 @@ def get_CKBatchedGEMM_config( ck_batched_gemm_dict = pd.read_csv( AITER_CONFIGS.AITER_CONFIG_A8W8_BATCHED_GEMM_FILE ).drop_duplicates() - - get_CKBatchedGEMM_config.ck_batched_gemm_dict = ck_batched_gemm_dict.set_index( - ["cu_num", "B", "M", "N", "K"] - ).to_dict("index") + # Use (gfx, cu_num, B, M, N, K) key when the CSV has a gfx column (new schema). + # Fall back to (cu_num, B, M, N, K) for old CSVs that pre-date the gfx column. + if "gfx" in ck_batched_gemm_dict.columns: + get_CKBatchedGEMM_config.ck_batched_gemm_dict = ( + ck_batched_gemm_dict.set_index( + ["gfx", "cu_num", "B", "M", "N", "K"] + ).to_dict("index") + ) + get_CKBatchedGEMM_config.has_gfx = True + else: + logger.warning( + f"{AITER_CONFIGS.AITER_CONFIG_A8W8_BATCHED_GEMM_FILE} has no 'gfx' column — " + "falling back to cu_num-only key. Re-run the tuner or migrate the CSV." + ) + get_CKBatchedGEMM_config.ck_batched_gemm_dict = ( + ck_batched_gemm_dict.set_index(["cu_num", "B", "M", "N", "K"]).to_dict( + "index" + ) + ) + get_CKBatchedGEMM_config.has_gfx = False + gfx = get_gfx() cu_num = get_cu_num() - config = get_CKBatchedGEMM_config.ck_batched_gemm_dict.get( - (cu_num, B, M, N, K), None + key = ( + (gfx, cu_num, B, M, N, K) + if get_CKBatchedGEMM_config.has_gfx + else (cu_num, B, M, N, K) ) + config = get_CKBatchedGEMM_config.ck_batched_gemm_dict.get(key, None) if config is not None: if AITER_LOG_TUNED_CONFIG: logger.info( diff --git a/aiter/ops/batched_gemm_op_bf16.py b/aiter/ops/batched_gemm_op_bf16.py index 43625fa429..8fed3bf858 100644 --- a/aiter/ops/batched_gemm_op_bf16.py +++ b/aiter/ops/batched_gemm_op_bf16.py @@ -8,12 +8,11 @@ import pandas as pd from ..jit.core import ( compile_ops, - AITER_ROOT_DIR, AITER_CONFIGS, AITER_LOG_TUNED_CONFIG, ) from ..utility import dtypes -from ..jit.utils.chip_info import get_cu_num +from ..jit.utils.chip_info import get_cu_num, get_gfx_runtime as get_gfx from aiter import logger @@ -58,13 +57,34 @@ def get_CKBatchedGEMM_config( ck_batched_gemm_dict = pd.read_csv( AITER_CONFIGS.AITER_CONFIG_BF16_BATCHED_GEMM_FILE ).drop_duplicates() - get_CKBatchedGEMM_config.ck_batched_gemm_dict = ck_batched_gemm_dict.set_index( - ["cu_num", "B", "M", "N", "K"] - ).to_dict("index") + # Use (gfx, cu_num, B, M, N, K) key when the CSV has a gfx column (new schema). + # Fall back to (cu_num, B, M, N, K) for old CSVs that pre-date the gfx column. + if "gfx" in ck_batched_gemm_dict.columns: + get_CKBatchedGEMM_config.ck_batched_gemm_dict = ( + ck_batched_gemm_dict.set_index( + ["gfx", "cu_num", "B", "M", "N", "K"] + ).to_dict("index") + ) + get_CKBatchedGEMM_config.has_gfx = True + else: + logger.warning( + f"{AITER_CONFIGS.AITER_CONFIG_BF16_BATCHED_GEMM_FILE} has no 'gfx' column — " + "falling back to cu_num-only key. Re-run the tuner or migrate the CSV." + ) + get_CKBatchedGEMM_config.ck_batched_gemm_dict = ( + ck_batched_gemm_dict.set_index(["cu_num", "B", "M", "N", "K"]).to_dict( + "index" + ) + ) + get_CKBatchedGEMM_config.has_gfx = False + gfx = get_gfx() cu_num = get_cu_num() - config = get_CKBatchedGEMM_config.ck_batched_gemm_dict.get( - (cu_num, B, M, N, K), None + key = ( + (gfx, cu_num, B, M, N, K) + if get_CKBatchedGEMM_config.has_gfx + else (cu_num, B, M, N, K) ) + config = get_CKBatchedGEMM_config.ck_batched_gemm_dict.get(key, None) if config is not None: if AITER_LOG_TUNED_CONFIG: logger.info( diff --git a/aiter/ops/gemm_op_a4w4.py b/aiter/ops/gemm_op_a4w4.py index 95cf5a99df..b63ff90ba7 100644 --- a/aiter/ops/gemm_op_a4w4.py +++ b/aiter/ops/gemm_op_a4w4.py @@ -11,7 +11,7 @@ from torch import Tensor from ..jit.core import AITER_CONFIGS, AITER_LOG_TUNED_CONFIG, compile_ops -from ..jit.utils.chip_info import get_cu_num, get_gfx +from ..jit.utils.chip_info import get_cu_num, get_gfx_runtime as get_gfx from ..ops.gemm_op_common import get_padded_m from ..utility import dtypes @@ -33,16 +33,37 @@ def compute_gemm_SplitK(M: int, N: int, K: int, tile_m: int, tile_n: int, tile_k def get_GEMM_config(M: int, N: int, K: int): tuned_file = AITER_CONFIGS.AITER_CONFIG_GEMM_A4W4_FILE if not hasattr(get_GEMM_config, "gemm_dict"): - gemm_dict = pd.read_csv(tuned_file).drop_duplicates() - get_GEMM_config.gemm_dict = gemm_dict.set_index( - ["cu_num", "M", "N", "K"] - ).to_dict("index") + gemm_dict = pd.read_csv( + AITER_CONFIGS.AITER_CONFIG_GEMM_A4W4_FILE + ).drop_duplicates() + # Use (gfx, cu_num, M, N, K) key when the CSV has a gfx column (new schema). + # Fall back to (cu_num, M, N, K) for old CSVs that pre-date the gfx column. + if "gfx" in gemm_dict.columns: + get_GEMM_config.gemm_dict = gemm_dict.set_index( + ["gfx", "cu_num", "M", "N", "K"] + ).to_dict("index") + get_GEMM_config.has_gfx = True + else: + logger.warning( + f"{AITER_CONFIGS.AITER_CONFIG_GEMM_A4W4_FILE} has no 'gfx' column — " + "falling back to cu_num-only key. Re-run the tuner or migrate the CSV." + ) + get_GEMM_config.gemm_dict = gemm_dict.set_index( + ["cu_num", "M", "N", "K"] + ).to_dict("index") + get_GEMM_config.has_gfx = False + gfx = get_gfx() cu_num = get_cu_num() padded_M = M config = None for gl in [None, 0, 1]: padded_M = M if gl is None else get_padded_m(M, N, K, gl) - config = get_GEMM_config.gemm_dict.get((cu_num, padded_M, N, K), None) + key = ( + (gfx, cu_num, padded_M, N, K) + if get_GEMM_config.has_gfx + else (cu_num, padded_M, N, K) + ) + config = get_GEMM_config.gemm_dict.get(key, None) if config is not None: if AITER_LOG_TUNED_CONFIG: logger.info( diff --git a/aiter/ops/gemm_op_a8w8.py b/aiter/ops/gemm_op_a8w8.py index e48fcae86d..7b3e8cb1d3 100644 --- a/aiter/ops/gemm_op_a8w8.py +++ b/aiter/ops/gemm_op_a8w8.py @@ -13,10 +13,9 @@ from ..jit.core import ( AITER_CONFIGS, AITER_LOG_TUNED_CONFIG, - AITER_ROOT_DIR, compile_ops, ) -from ..jit.utils.chip_info import get_cu_num +from ..jit.utils.chip_info import get_cu_num, get_gfx_runtime as get_gfx from ..jit.utils.torch_guard import torch_compile_guard from ..ops.gemm_op_common import get_padded_m from ..utility import dtypes @@ -346,29 +345,42 @@ def compute_gemm_SplitK(M: int, N: int, K: int, tile_m: int, tile_n: int, tile_k return splitK -_CKGEMM_CONFIG_CACHE = None +_CKGEMM_CONFIG_CACHE: dict = {} +_CKGEMM_HAS_GFX: dict = {} @functools.lru_cache(maxsize=1024) -def get_CKGEMM_config(M: int, N: int, K: int, tuned_file="a8w8_tuned_gemm.csv"): +def get_CKGEMM_config(M: int, N: int, K: int, tuned_file=None): if tuned_file is None: - tuned_file = "a8w8_tuned_gemm.csv" - global _CKGEMM_CONFIG_CACHE - - if _CKGEMM_CONFIG_CACHE is None: - _CKGEMM_CONFIG_CACHE = {} + tuned_file = AITER_CONFIGS.AITER_CONFIG_GEMM_A8W8_FILE if tuned_file not in _CKGEMM_CONFIG_CACHE: ckgemm_dict = pd.read_csv(f"{tuned_file}").drop_duplicates() - _CKGEMM_CONFIG_CACHE[tuned_file] = ckgemm_dict.set_index( - ["cu_num", "M", "N", "K"] - ).to_dict("index") + # Use (gfx, cu_num, M, N, K) key when the CSV has a gfx column (new schema). + # Fall back to (cu_num, M, N, K) for old CSVs that pre-date the gfx column. + if "gfx" in ckgemm_dict.columns: + _CKGEMM_CONFIG_CACHE[tuned_file] = ckgemm_dict.set_index( + ["gfx", "cu_num", "M", "N", "K"] + ).to_dict("index") + _CKGEMM_HAS_GFX[tuned_file] = True + else: + logger.warning( + f"{tuned_file} has no 'gfx' column — falling back to cu_num-only key. " + "Re-run the tuner or migrate the CSV to add a gfx column." + ) + _CKGEMM_CONFIG_CACHE[tuned_file] = ckgemm_dict.set_index( + ["cu_num", "M", "N", "K"] + ).to_dict("index") + _CKGEMM_HAS_GFX[tuned_file] = False + gfx = get_gfx() cu_num = get_cu_num() + has_gfx = _CKGEMM_HAS_GFX[tuned_file] padded_M = M config = None for gl in [None, 0, 1]: padded_M = M if gl is None else get_padded_m(M, N, K, gl) - config = _CKGEMM_CONFIG_CACHE[tuned_file].get((cu_num, padded_M, N, K), None) + key = (gfx, cu_num, padded_M, N, K) if has_gfx else (cu_num, padded_M, N, K) + config = _CKGEMM_CONFIG_CACHE[tuned_file].get(key, None) if config is not None: if AITER_LOG_TUNED_CONFIG: logger.info( @@ -382,35 +394,53 @@ def get_CKGEMM_config(M: int, N: int, K: int, tuned_file="a8w8_tuned_gemm.csv"): return config +_GEMM_QUANT_TYPE_CACHE: dict = {} +_GEMM_QUANT_TYPE_HAS_GFX: dict = {} + + @functools.lru_cache(maxsize=1024) def get_GEMM_config_with_quant_type( M: int, N: int, K: int, q_dtype_w: torch.dtype, - tuned_file=f"{AITER_ROOT_DIR}/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv", + tuned_file=None, ): - # Use dict to cache configs for different files - if not hasattr(get_GEMM_config_with_quant_type, "file_cache"): - get_GEMM_config_with_quant_type.file_cache = {} - + if tuned_file is None: + tuned_file = AITER_CONFIGS.AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE # Load file if not cached - if tuned_file not in get_GEMM_config_with_quant_type.file_cache: + if tuned_file not in _GEMM_QUANT_TYPE_CACHE: asmGemmDictDf = pd.read_csv(tuned_file).drop_duplicates() - get_GEMM_config_with_quant_type.file_cache[tuned_file] = ( - asmGemmDictDf.set_index(["cu_num", "M", "N", "K", "q_dtype_w"]).to_dict( - "index" + # Use (gfx, cu_num, M, N, K, q_dtype_w) key when the CSV has a gfx column (new schema). + # Fall back to (cu_num, M, N, K, q_dtype_w) for old CSVs that pre-date the gfx column. + if "gfx" in asmGemmDictDf.columns: + _GEMM_QUANT_TYPE_CACHE[tuned_file] = asmGemmDictDf.set_index( + ["gfx", "cu_num", "M", "N", "K", "q_dtype_w"] + ).to_dict("index") + _GEMM_QUANT_TYPE_HAS_GFX[tuned_file] = True + else: + logger.warning( + f"{tuned_file} has no 'gfx' column — falling back to cu_num-only key. " + "Re-run the tuner or migrate the CSV to add a gfx column." ) - ) + _GEMM_QUANT_TYPE_CACHE[tuned_file] = asmGemmDictDf.set_index( + ["cu_num", "M", "N", "K", "q_dtype_w"] + ).to_dict("index") + _GEMM_QUANT_TYPE_HAS_GFX[tuned_file] = False + gfx = get_gfx() cu_num = get_cu_num() + has_gfx = _GEMM_QUANT_TYPE_HAS_GFX[tuned_file] padded_M = M config = None for gl in [None, 0, 1]: padded_M = M if gl is None else get_padded_m(M, N, K, gl) - config = get_GEMM_config_with_quant_type.file_cache[tuned_file].get( - (cu_num, padded_M, N, K, str(q_dtype_w)), None + key = ( + (gfx, cu_num, padded_M, N, K, str(q_dtype_w)) + if has_gfx + else (cu_num, padded_M, N, K, str(q_dtype_w)) ) + config = _GEMM_QUANT_TYPE_CACHE[tuned_file].get(key, None) if config is not None: if AITER_LOG_TUNED_CONFIG: msg = f"shape M:{M}, N:{N}, K:{K} q_dtype_w:{q_dtype_w}, found padded_M: {padded_M}, N:{N}, K:{K} is tuned, in {tuned_file}!" @@ -633,8 +663,6 @@ def gemm_a8w8_blockscale( n = WQ.shape[0] k = XQ.shape[1] Y = torch.empty(m, n, dtype=dtype, device=XQ.device) - from aiter.jit.utils.chip_info import get_gfx - if isBpreshuffled: if get_gfx() in ["gfx950"] and m >= 16 and k >= 512 and dtype == dtypes.bf16: return gfx950_a8w8_blockscale_ASM(XQ, WQ, x_scale, w_scale, Y) diff --git a/aiter/utility/base_tuner.py b/aiter/utility/base_tuner.py index 33559b9d27..ef364b1a68 100644 --- a/aiter/utility/base_tuner.py +++ b/aiter/utility/base_tuner.py @@ -14,6 +14,7 @@ from operator import itemgetter import time from aiter import dtypes +from aiter.jit.utils.chip_info import get_gfx_runtime as _chip_get_gfx INVALID_TIME = -1 @@ -329,16 +330,31 @@ def get_retune_gemm_list(self, args): if self.get_out_file(args.tune_file) == args.untune_file: # retune all shapes in tune_file self.untunedf = self.get_untuned_gemm_list(args.untune_file) - self.tunedf = self.untunedf[self.untunedf["cu_num"] != self.get_cu_num()] - self.untunedf = self.untunedf[self.untunedf["cu_num"] == self.get_cu_num()] + gfx = self.get_gfx() + cu_num = self.get_cu_num() + if "gfx" not in self.untunedf.columns: + self.untunedf["gfx"] = gfx + target_mask = (self.untunedf["gfx"] == gfx) & ( + self.untunedf["cu_num"] == cu_num + ) + self.tunedf = self.untunedf[~target_mask] + self.untunedf = self.untunedf[target_mask] self.untunedf = self.untunedf[self.keys] else: # retune shapes that are in both untune_file and tune_file untunedf = self.get_untuned_gemm_list(args.untune_file) + gfx = self.get_gfx() + cu_num = self.get_cu_num() if "cu_num" not in untunedf.columns: - untunedf["cu_num"] = self.get_cu_num() + untunedf["gfx"] = gfx + untunedf["cu_num"] = cu_num else: - untunedf = untunedf[untunedf["cu_num"] == self.get_cu_num()] + target_mask = untunedf["cu_num"] == cu_num + if "gfx" in untunedf.columns: + target_mask = target_mask & (untunedf["gfx"] == gfx) + else: + untunedf["gfx"] = gfx + untunedf = untunedf[target_mask] self.untunedf = untunedf[self.keys] self.tunedf = self.get_tuned_gemm_list(args.tune_file) @@ -402,6 +418,9 @@ def get_cu_num(self): cu_num = device_properties.multi_processor_count return cu_num + def get_gfx(self): + return _chip_get_gfx() + def post_process(self, rets, args, topk=-1, fast_mode=False): """post process, post process all results to return topk results""" rets = list(rets) @@ -1331,7 +1350,7 @@ class GemmCommonTuner(TunerCommon): def __init__( self, name, - key=["cu_num", "M", "N", "K"], + key=["gfx", "cu_num", "M", "N", "K"], resultList=[ "kernelId", "splitK", @@ -1358,6 +1377,7 @@ def pre_process(self, args): self.get_retune_gemm_list(args) else: self.untunedf = self.get_untuned_gemm_list(args.untune_file) + self.untunedf["gfx"] = self.get_gfx() self.untunedf["cu_num"] = self.get_cu_num() self.untunedf = self.untunedf[self.keys] self.tunedf = self.get_tuned_gemm_list(args.tune_file) @@ -1379,7 +1399,10 @@ def calculate(self, results, bpes=(2, 2, 2)): info, time, err_ratio = results if time == -1: return 0, 0 - cu_num, m, n, k, *rest = info[0] + if len(info[0]) >= 5: # gfx-aware key: (gfx, cu_num, m, n, k, ...) + _gfx, cu_num, m, n, k, *rest = info[0] + else: # legacy subclass key: (cu_num, m, n, k, ...) + cu_num, m, n, k, *rest = info[0] flop = m * n * k * 2 tflops = round(flop / (time * 1000000), 2) lhs_bpe, rhs_bpe, out_bpe = bpes @@ -1486,7 +1509,10 @@ def update_tflops_bw(self, file): resultdf.to_csv(file, index=False, na_rep="Null") def set_run_iters(self, input, inputdtype): - cu_num, m, n, k, *rest = input + if len(input) >= 5: # gfx-aware key: (gfx, cu_num, m, n, k, ...) + _gfx, cu_num, m, n, k, *rest = input + else: # legacy subclass key: (cu_num, m, n, k, ...) + cu_num, m, n, k, *rest = input flops = m * n * k * 2 if flops < 256 * 5120 * 256 * 2: self.num_warmup = 50 diff --git a/aiter/utility/pretune.py b/aiter/utility/pretune.py new file mode 100644 index 0000000000..da42bcdf61 --- /dev/null +++ b/aiter/utility/pretune.py @@ -0,0 +1,480 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +""" +pretune.py — run GEMM tuners for all CSV shapes on the live GPU. + +Two entry points: + +1. Via PRETUNE_MODULES during setup.py build (full build + retune + .so rebuild): + + PREBUILD_KERNELS=1 PRETUNE_MODULES=module_gemm_a8w8_blockscale_tune \ + python setup.py develop + +2. As a standalone script on an already-installed aiter (tune only, no full rebuild): + + python3 aiter/utility/pretune.py module_gemm_a8w8_blockscale_tune + python3 aiter/utility/pretune.py module_gemm_a8w8_tune,module_gemm_a8w8_blockscale_tune + python3 aiter/utility/pretune.py all + python3 aiter/utility/pretune.py --list # show available tune modules + + After tuning completes, the inference .so is rebuilt automatically. + Verify with: python3 op_tests/test_gemm_a8w8_blockscale.py + +Both modes accept a single module name, a comma-separated list, or "all". +Requires a live GPU — the GPU's architecture and cu_num are auto-detected and used to tag the tuned results. +All shapes in the merged tune CSV are (re-)tuned for the live GPU. + +Flow per module (PRETUNE_MODULES / setup.py path): + gen_instances.py --tune → build tune .so (all candidate kernels) + .py --all → benchmark on live GPU, update CSV + gen_instances.py --tune_file → rebuild inference .so (winners only) + +Flow per module (standalone / direct path): + .py --all → JIT-builds tune .so on first run, then + benchmarks all shapes and writes winners + back to the primary source CSV + gen_instances.py --tune_file → rebuild inference .so (winners only) +""" + +import json +import logging +import os +import re +import subprocess +import sys +import tempfile + +logger = logging.getLogger("aiter") + + +# --------------------------------------------------------------------------- +# Tune module script fallback table +# +# Some _tune modules share a tune script with a parent module. The parent +# tuner covers the child's kernel family via --libtype all. +# Value None means no viable tune script exists — the module is skipped with +# a warning. +# +# Background: +# gemm_a8w8_blockscale_tune.py covers cktile and standard bpreshuffle +# variants via --libtype all, but it writes to AITER_CONFIG_GEMM_A8W8_BLOCKSCALE. +# The blockscale_bpreshuffle family uses a separate CSV +# (AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE) that no existing +# .py script writes to — those modules cannot be pretuned until a dedicated +# tune script is added. +# --------------------------------------------------------------------------- +_SCRIPT_FALLBACK: dict = { + # cktile variant: covered by blockscale parent tuner (--libtype all) + "module_gemm_a8w8_blockscale_cktile_tune": "module_gemm_a8w8_blockscale_tune", + # bpreshuffle_cktile: covered by bpreshuffle parent tuner + "module_gemm_a8w8_bpreshuffle_cktile_tune": "module_gemm_a8w8_bpreshuffle_tune", + # blockscale_bpreshuffle variants: no tune script writes to the bpreshuffle CSV + "module_gemm_a8w8_blockscale_bpreshuffle_tune": None, + "module_gemm_a8w8_blockscale_bpreshuffle_cktile_tune": None, +} + +_SENTINEL = object() # distinct from None: "not in fallback table" + + +def _get_tune_script(entry: dict, csrc_dir: str): + """Derive the tune .py path from the non-pybind _tune.cu src entry.""" + AITER_CSRC_DIR = csrc_dir # noqa: N806,F841 — referenced by eval() + for src_expr in entry.get("srcs", []): + if "_tune" in src_expr and "pybind" not in src_expr: + try: + return eval(src_expr).replace(".cu", ".py") # noqa: S307 + except Exception: + pass + return None + + +def _get_config_attr(cfg: dict, tune_module_name: str): + """ + Find the AITER_CONFIGS. property name used by the inference module + that corresponds to this tune module. + + Strips _cktile_tune / _tune suffixes to derive candidate inference module + names and searches their blob_gen_cmd for AITER_CONFIGS.. + """ + candidates = [ + tune_module_name.replace("_cktile_tune", "").replace("_tune", ""), + tune_module_name.replace("_tune", ""), + ] + for inf_name in candidates: + cmd = cfg.get(inf_name, {}).get("blob_gen_cmd", "") + m = re.search(r"AITER_CONFIGS\.(\w+)", cmd) + if m: + return m.group(1) + return None + + +def _resolve(module_name: str, cfg: dict, csrc_dir: str): + """ + Return (tune_script_path, config_attr) for a tune module. + + Looks up the module's own tune script; if absent or missing on disk, + consults _SCRIPT_FALLBACK. Returns (None, config_attr) when no script + is available. + """ + entry = cfg.get(module_name, {}) + tune_script = _get_tune_script(entry, csrc_dir) + config_attr = _get_config_attr(cfg, module_name) + + if tune_script and not os.path.exists(tune_script): + tune_script = None + + if tune_script is None: + fallback_key = _SCRIPT_FALLBACK.get(module_name, _SENTINEL) + if fallback_key is _SENTINEL: + # Not in table: auto-derive parent by stripping _cktile suffix + parent = module_name.replace("_cktile_tune", "_tune") + fallback_key = parent if (parent != module_name and parent in cfg) else None + if fallback_key is not None: + fb_script = _get_tune_script(cfg.get(fallback_key, {}), csrc_dir) + if fb_script and os.path.exists(fb_script): + tune_script = fb_script + + return tune_script, config_attr + + +def _all_tune_modules(cfg: dict) -> list: + return [k for k in cfg if k.endswith("_tune")] + + +def _parse_module_list(value: str, cfg: dict) -> list[str]: + """Parse a PRETUNE_MODULES value into a list of module names. + + Expands "all" to every supported tune module (unsupported variants with no + tune script are excluded). Comma-separated values are split and stripped. + No further validation is performed — callers are responsible for checking + whether returned names are known and handling duplicates. + """ + _unsupported = {m for m, v in _SCRIPT_FALLBACK.items() if v is None} + if value.strip().lower() == "all": + return [m for m in _all_tune_modules(cfg) if m not in _unsupported] + return [m.strip() for m in value.split(",") if m.strip()] + + +def _make_untune_csv(tune_file: str, shape_keys: list) -> str: + """ + Read all paths in tune_file (colon-separated AITER multi-config format), + concatenate, extract all unique rows for shape_keys columns (absent columns + silently ignored), and write to a named temp file. + + Returns the temp file path — caller must delete it. + """ + import pandas as pd # deferred: absent during CI metadata-only phase + + paths = [p for p in tune_file.split(os.pathsep) if p] + dfs = [] + for p in paths: + if os.path.exists(p): + dfs.append(pd.read_csv(p)) + else: + logger.warning(f"[pretune] CSV not found, skipping: {p}") + + if not dfs: + raise FileNotFoundError(f"[pretune] No CSV files found for: {tune_file}") + + merged = pd.concat(dfs, ignore_index=True) + present = [k for k in shape_keys if k in merged.columns] + if not present: + raise ValueError( + f"[pretune] None of {shape_keys} found in CSV columns: " + f"{merged.columns.tolist()}" + ) + + shapes = merged[present].drop_duplicates().reset_index(drop=True) + tmp = tempfile.NamedTemporaryFile( + mode="w", suffix=".csv", prefix="aiter_pretune_", delete=False + ) + shapes.to_csv(tmp.name, index=False) + tmp.close() + logger.info(f"[pretune] {len(shapes)} unique shapes → {tmp.name}") + return tmp.name + + +def run_pretune( + module_name: str, + cfg: dict, + core, + csrc_dir: str, + repo_dir: str, + build_one_module=None, + libtype: str = "all", +) -> None: + """ + Pretune cycle for one tune module. + + When build_one_module is provided (setup.py / PRETUNE_MODULES path): + 1. Build the tune .so (gen_instances.py --tune → all candidate kernels). + 2. Write a temp untune CSV with all unique shape keys from the merged tune CSV, + no gfx/cu_num — the tuner auto-fills those from the live GPU. + 3. Run the tune script with --all so every shape is re-benchmarked. + 4. Rebuild the inference .so (gen_instances.py --tune_file → winners only). + + When build_one_module is None (standalone / direct path): + Steps 2-3 run as above. Step 1 is skipped — the tune script JIT-builds + the tune .so on first invocation. Step 4 uses core.build_module() directly + (no PREBUILD_KERNELS flag injection). Results are written back to the + primary source CSV rather than the ephemeral /tmp merged path. + """ + _log = print if build_one_module is None else logger.info + _warn = print if build_one_module is None else logger.warning + + tune_script, config_attr = _resolve(module_name, cfg, csrc_dir) + + if not tune_script: + _warn(f"[pretune] {module_name}: no tune script available. Skipping.") + return + if not config_attr: + _warn(f"[pretune] {module_name}: cannot determine CSV config attr. Skipping.") + return + + # Merged tune file: used as shape source (includes model_config CSVs). + # In setup.py mode write_tune_file == tune_file (merged /tmp path). + # In direct mode write_tune_file is the primary source CSV so that new rows + # are written back to the repo CSV, not the ephemeral /tmp merge. + tune_file = getattr(core.AITER_CONFIGS, config_attr) + if build_one_module is None: + # The source CSV paths are module-level variables in core (e.g. + # AITER_CONFIG_GEMM_A8W8_BLOCKSCALE), not instance attributes on + # AITER_CONFIGS. Strip the _FILE suffix to derive the variable name. + source_attr = config_attr.removesuffix("_FILE") + source_paths_str = getattr(core, source_attr, None) + write_tune_file = ( + source_paths_str.split(os.pathsep)[0] if source_paths_str else tune_file + ) + else: + write_tune_file = tune_file + + _log( + f"[pretune] {module_name}: " + f"script={os.path.relpath(tune_script, repo_dir)}, " + f"tune_file={tune_file}" + ) + + # ── 1. Build tune .so (setup.py path only) ──────────────────────────── + if build_one_module is not None: + tune_args = core.get_args_of_build(ops_name=module_name) + if isinstance(tune_args, dict) and tune_args.get("srcs"): + logger.info(f"[pretune] building {module_name}") + build_one_module(tune_args) + else: + logger.warning( + f"[pretune] get_args_of_build({module_name!r}) returned no srcs. " + "Tune .so may already exist or module is unknown." + ) + + # ── 2. Write untune CSV ──────────────────────────────────────────────── + # Shape key columns only (no gfx/cu_num). B included for batched GEMM; + # silently dropped if absent. + # With --all + untune_file != tune_file, get_retune_gemm_list() else-branch + # auto-tags rows with live GPU's gfx/cu_num, re-benchmarks shapes already + # in tune_file, and tunes shapes not yet present for this GPU. + shape_keys = ["B", "M", "N", "K"] + untune_csv = _make_untune_csv(tune_file, shape_keys) + + try: + # ── 3. Run tuner ─────────────────────────────────────────────────── + env = { + **os.environ, + "PYTHONPATH": f"{repo_dir}{os.pathsep}{os.environ.get('PYTHONPATH', '')}", + } + cmd = [ + sys.executable, + tune_script, + "--untune_file", + untune_csv, + "--tune_file", + write_tune_file, + "--libtype", + libtype, + "--all", + ] + _log(f"[pretune] running: {' '.join(cmd)}") + result = subprocess.run(cmd, env=env) + if result.returncode != 0: + _warn( + f"[pretune] tuner exited {result.returncode} for {module_name}. " + "Inference module will still be rebuilt with whatever was written." + ) + finally: + try: + os.unlink(untune_csv) + except OSError: + pass + + # ── 4. Rebuild inference .so ─────────────────────────────────────────── + inf_module = re.sub(r"_cktile_tune$|_tune$", "", module_name) + _log(f"[pretune] rebuilding inference module {inf_module}") + core.rm_module(inf_module) + core.clear_build(inf_module) + inf_args = core.get_args_of_build(ops_name=inf_module) + if isinstance(inf_args, dict) and inf_args.get("srcs"): + if build_one_module is not None: + build_one_module(inf_args) + else: + core.build_module( + md_name=inf_args["md_name"], + srcs=inf_args["srcs"], + flags_extra_cc=inf_args["flags_extra_cc"], + flags_extra_hip=inf_args["flags_extra_hip"], + blob_gen_cmd=inf_args["blob_gen_cmd"], + extra_include=inf_args["extra_include"], + extra_ldflags=None, + verbose=False, + is_python_module=True, + is_standalone=False, + torch_exclude=False, + third_party=inf_args["third_party"], + ) + else: + _warn( + f"[pretune] get_args_of_build({inf_module!r}) returned no srcs. " + "Inference module not rebuilt." + ) + + +def run_pretune_modules( + pretune_env: str, + cfg: dict, + core, + build_one_module, + csrc_dir: str, + repo_dir: str, +) -> None: + """ + Parse PRETUNE_MODULES and dispatch run_pretune() for each requested module. + + pretune_env values: + "all" → every supported _tune module in config + "module_gemm_a8w8_blockscale_tune" → single module + "module_gemm_a8w8_tune,module_gemm_a8w8_blockscale_tune" → comma list + """ + modules = _parse_module_list(pretune_env, cfg) + logger.info(f"[pretune] PRETUNE_MODULES → {len(modules)} modules to tune") + + seen_keys: set = set() + for mod in modules: + script, attr = _resolve(mod, cfg, csrc_dir) + key = (script, attr) + if key in seen_keys: + logger.warning( + f"[pretune] {mod}: same script+CSV already queued by an earlier module. " + "Skipping duplicate." + ) + continue + seen_keys.add(key) + try: + run_pretune( + mod, cfg, core, csrc_dir, repo_dir, build_one_module=build_one_module + ) + except Exception as exc: + logger.warning( + f"[pretune] {mod} failed: {exc}. Continuing with remaining modules." + ) + + +def _main() -> None: + import argparse + + # Auto-detect repo root from this file's location: utility/pretune.py → repo root + _this_dir = os.path.dirname(os.path.abspath(__file__)) + _default_repo_dir = os.path.dirname(os.path.dirname(_this_dir)) + + parser = argparse.ArgumentParser( + description=( + "Tune GEMM shapes for the live GPU on an already-installed aiter.\n\n" + "Examples:\n" + " python3 aiter/utility/pretune.py module_gemm_a8w8_blockscale_tune\n" + " python3 aiter/utility/pretune.py module_gemm_a8w8_tune,module_gemm_a8w8_blockscale_tune\n" + " python3 aiter/utility/pretune.py all\n" + " python3 aiter/utility/pretune.py --list" + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "modules", + nargs="?", + help="Module name, comma-separated list, or 'all'.", + ) + parser.add_argument( + "--libtype", + default="all", + choices=["ck", "cktile", "all"], + help="Kernel families to tune (default: all).", + ) + parser.add_argument( + "--list", + action="store_true", + help="Print available tune modules and exit.", + ) + parser.add_argument( + "--repo_dir", + default=_default_repo_dir, + help="Path to aiter repo root (auto-detected by default).", + ) + args = parser.parse_args() + + repo_dir = os.path.abspath(args.repo_dir) + csrc_dir = os.path.join(repo_dir, "csrc") + cfg_path = os.path.join(repo_dir, "aiter", "jit", "optCompilerConfig.json") + + with open(cfg_path, "r", encoding="utf-8") as f: + cfg = json.load(f) + + _unsupported = {m for m, v in _SCRIPT_FALLBACK.items() if v is None} + + if args.list: + all_modules = _all_tune_modules(cfg) + supported = [m for m in all_modules if m not in _unsupported] + print(f"Available tune modules ({len(supported)}):") + for m in sorted(supported): + _, config_attr = _resolve(m, cfg, csrc_dir) + print(f" {m:<55} {config_attr or 'unknown config'}") + return + + if not args.modules: + parser.print_help() + return + + all_known = set(_all_tune_modules(cfg)) + modules = _parse_module_list(args.modules, cfg) + if args.modules.strip().lower() == "all": + print(f"[pretune] tuning all {len(modules)} supported tune modules") + + # Deferred import: core requires torch, not available during CI metadata phase + sys.path.insert(0, os.path.join(repo_dir, "aiter")) + from jit import core # noqa: PLC0415 + + seen_keys: set = set() + for mod in modules: + if mod in _unsupported: + print( + f"[pretune] {mod}: not supported (no tune script writes to this CSV). " + "Skipping." + ) + continue + if mod not in all_known: + print( + f"[pretune] {mod}: unknown module. Run --list to see available modules." + ) + continue + script, attr = _resolve(mod, cfg, csrc_dir) + key = (script, attr) + if key in seen_keys: + print( + f"[pretune] {mod}: same script+CSV already queued by an earlier module. " + "Skipping duplicate." + ) + continue + seen_keys.add(key) + try: + run_pretune(mod, cfg, core, csrc_dir, repo_dir, libtype=args.libtype) + except Exception as exc: + print(f"[pretune] {mod} failed: {exc}. Continuing.") + + +if __name__ == "__main__": + _main() diff --git a/csrc/ck_batched_gemm_a8w8/README.md b/csrc/ck_batched_gemm_a8w8/README.md index 1b7e00338a..65ed237bd9 100644 --- a/csrc/ck_batched_gemm_a8w8/README.md +++ b/csrc/ck_batched_gemm_a8w8/README.md @@ -13,11 +13,11 @@ Run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_a8w8_tune via jit: `python3 csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_batched_gemm.csv -o aiter/configs/a8w8_tuned_batched_gemm.csv` You can find the results of the tuning in `aiter/configs/a8w8_tuned_batched_gemm.csv`, like this: - |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| - |----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| - |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + |**gfx** |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |---------|----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |gfx942 |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | - `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `gfx` identifies the GPU architecture (e.g. `gfx942`, `gfx950`). `cu_num` is the number of compute units and distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_batched_gemm_a8w8.py` and run it, please wait a few minutes as it will build batched_gemm_a8w8 tuned kernels in `aiter/configs/a8w8_tuned_batched_gemm.csv` via jit: diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu index 4419eaf367..b63007bdb0 100644 --- a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu @@ -4,6 +4,7 @@ #include "batched_gemm_a8w8_common.cuh" #include "batched_gemm_a8w8_manifest.h" #include "batched_gemm_a8w8_lookup.h" +#include "gemm_dispatch_utils.h" #include using BatchedRowwiseKernel = std::function< @@ -12,25 +13,9 @@ using BatchedRowwiseKernel = std::function< torch::Tensor &, std::optional, int)>; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple &t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - auto hash4 = std::hash{}(std::get<3>(t)); - return hash1 ^ hash2 ^ hash3 ^ hash4; - } -}; - // For certain high priority shapes, we directly use the best kernel rather // than use heuristics. -using BatchedRowwiseKernelMap = std::unordered_map< - std::tuple, - BatchedRowwiseKernel, - IntTupleHash>; +using BatchedRowwiseKernelMap = BatchedGemmDispatchMap; template BatchedRowwiseKernel batched_rowwise_heuristic_dispatch(int B, int M, int N, int K) @@ -113,9 +98,12 @@ BatchedRowwiseKernel batched_rowwise_dispatch(int B, int M, int N, int K) static_assert(false, "batched_rowwise_dispatch used with unsupported dtype!"); } }(); - - // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({B, M, N, K}); + + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + + // First check if this shape(B,M,N,K) is available in the direct lookup. + auto it = lookup.find({gfx, cu_num, B, M, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { @@ -135,8 +123,8 @@ BatchedRowwiseKernel batched_rowwise_dispatch(int B, int M, int N, int K) { padded_m = 20480; } - // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({B, padded_m, N, K}); + // Second check if this shape(B,padded_m,N,K) is available in the direct lookup. + it = lookup.find({gfx, cu_num, B, padded_m, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py index 27797405e4..25cecede8f 100644 --- a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py @@ -113,7 +113,7 @@ def calculate(self, results, bpes=(1, 1, 2)): if time == -1: return -1, -1 print(info[0]) - cu_num, b, m, n, k = info[0] + gfx, cu_num, b, m, n, k = info[0] flops = m * n * k * 2 * b tflops = round(flops / (time * 1000000), 2) lhs_bpe, rhs_bpe, out_bpe = bpes @@ -142,6 +142,7 @@ def tune( shape_grouped = args.shape_grouped errRatio = args.errRatio cu_num = self.get_cu_num() + gfx = self.get_gfx() task = [] tasks_data = [] for i in range(len(untunedf)): @@ -172,7 +173,7 @@ def tune( else 0 ) for splitK in range(maxsplitK + 1): - info = ((cu_num, B, M, N, K), kid, splitK, "") + info = ((gfx, cu_num, B, M, N, K), kid, splitK, "") task.append( ( info, @@ -218,6 +219,7 @@ def tune( if __name__ == "__main__": key = [ + "gfx", "cu_num", "B", "M", diff --git a/csrc/ck_batched_gemm_a8w8/gen_instances.py b/csrc/ck_batched_gemm_a8w8/gen_instances.py index e089a41808..a13a58283e 100644 --- a/csrc/ck_batched_gemm_a8w8/gen_instances.py +++ b/csrc/ck_batched_gemm_a8w8/gen_instances.py @@ -2,12 +2,28 @@ # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. import argparse import os +import sys import shutil from pathlib import Path import pandas as pd -import torch -from batched_gemm_a8w8_common import default_kernels_dict, kernelInstance, kernels_list + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict_batched, write_lookup_header # noqa: E402 + +from batched_gemm_a8w8_common import ( # noqa: E402 + default_kernels_dict, + kernelInstance, + kernels_list, +) class batched_gemm_a8w8_fwd_codegen: @@ -175,7 +191,7 @@ def gen_lookup_dict(self, kernels_dict): { \\""" LOOKUP_template = """ - {{{mnk}, \\ + {{{MNK}, \\ {kernel_name}}}, \\""" LOOKUP_end = """ @@ -183,24 +199,14 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open( - os.path.join(self.working_path, "batched_gemm_a8w8_lookup.h"), "w" - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - mnk="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(mnk=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "batched_gemm_a8w8_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -254,22 +260,11 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[tune_df["cu_num"] == cu_num].reset_index() - for i in range(len(tune_df)): - B = tune_df.loc[i, "B"] - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - kid = tune_df.loc[i, "kernelId"] - tune_dict[(B, M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict_batched( + pd.read_csv(tune_dict_csv), default_kernels_dict, kernels_list + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/ck_batched_gemm_bf16/README.md b/csrc/ck_batched_gemm_bf16/README.md index 714435e3c0..0df7b0bcb4 100644 --- a/csrc/ck_batched_gemm_bf16/README.md +++ b/csrc/ck_batched_gemm_bf16/README.md @@ -13,11 +13,11 @@ Run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_bf16_tune via jit: `python3 csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py -i aiter/configs/bf16_untuned_batched_gemm.csv -o aiter/configs/bf16_tuned_batched_gemm.csv` You can find the results of the tuning in `aiter/configs/bf16_tuned_batched_gemm.csv`, like this: - |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| - |----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| - |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + |**gfx** |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |---------|----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |gfx942 |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | - `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `gfx` identifies the GPU architecture (e.g. `gfx942`, `gfx950`). `cu_num` is the number of compute units and distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_batched_gemm_bf16.py` and run it, please wait a few minutes as it will build batched_gemm_bf16 tuned kernels in `aiter/configs/bf16_tuned_batched_gemm.csv` via jit: diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu index 2947b070ef..30e9d0e00f 100644 --- a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu @@ -4,6 +4,7 @@ #include "batched_gemm_bf16_common.cuh" #include "batched_gemm_bf16_manifest.h" #include "batched_gemm_bf16_lookup.h" +#include "gemm_dispatch_utils.h" #include using BatchedKernel = std::function< @@ -11,25 +12,9 @@ using BatchedKernel = std::function< torch::Tensor &, std::optional, int)>; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple &t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - auto hash4 = std::hash{}(std::get<3>(t)); - return hash1 ^ hash2 ^ hash3 ^ hash4; - } -}; - // For certain high priority shapes, we directly use the best kernel rather // than use heuristics. -using BatchedKernelMap = std::unordered_map< - std::tuple, - BatchedKernel, - IntTupleHash>; +using BatchedKernelMap = BatchedGemmDispatchMap; BatchedKernel batched_heuristic_dispatch(int B, int M, int N, int K) { @@ -104,9 +89,12 @@ BatchedKernel batched_dispatch(int B, int M, int N, int K) { return BatchedKernelMap{GENERATE_LOOKUP_TABLE()}; }(); - - // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({B, M, N, K}); + + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + + // First check if this shape(B,M,N,K) is available in the direct lookup. + auto it = lookup.find({gfx, cu_num, B, M, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { @@ -126,8 +114,8 @@ BatchedKernel batched_dispatch(int B, int M, int N, int K) { padded_m = 20480; } - // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({B, padded_m, N, K}); + // Second check if this shape(B,padded_m,N,K) is available in the direct lookup. + it = lookup.find({gfx, cu_num, B, padded_m, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py index 74e5fa936c..61b4f1be37 100644 --- a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py @@ -92,7 +92,7 @@ def calculate(self, results, bpes=(2, 2, 2)): info, time, err_ratio = results if time == -1: return -1, -1 - cu_num, b, m, n, k = info[0] + gfx, cu_num, b, m, n, k = info[0] flops = m * n * k * 2 * b tflops = round(flops / (time * 1000000), 2) lhs_bpe, rhs_bpe, out_bpe = bpes @@ -121,6 +121,7 @@ def tune( shape_grouped = args.shape_grouped errRatio = args.errRatio cu_num = self.get_cu_num() + gfx = self.get_gfx() task = [] tasks_data = [] @@ -149,7 +150,7 @@ def tune( else 0 ) for splitK in range(maxsplitK + 1): - info = ((cu_num, B, M, N, K), kid, splitK, "") + info = ((gfx, cu_num, B, M, N, K), kid, splitK, "") task.append( ( info, @@ -195,6 +196,7 @@ def tune( if __name__ == "__main__": key = [ + "gfx", "cu_num", "B", "M", diff --git a/csrc/ck_batched_gemm_bf16/gen_instances.py b/csrc/ck_batched_gemm_bf16/gen_instances.py index 0f1a11ed1d..8cb7f01481 100644 --- a/csrc/ck_batched_gemm_bf16/gen_instances.py +++ b/csrc/ck_batched_gemm_bf16/gen_instances.py @@ -2,12 +2,28 @@ # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. import argparse import os +import sys import shutil from pathlib import Path import pandas as pd -import torch -from batched_gemm_bf16_common import default_kernels_dict, kernelInstance, kernels_list + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict_batched, write_lookup_header # noqa: E402 + +from batched_gemm_bf16_common import ( # noqa: E402 + default_kernels_dict, + kernelInstance, + kernels_list, +) class batched_gemm_bf16_fwd_codegen: @@ -161,7 +177,7 @@ def gen_lookup_dict(self, kernels_dict): { \\""" LOOKUP_template = """ - {{{mnk}, \\ + {{{MNK}, \\ {kernel_name}}}, \\""" LOOKUP_end = """ @@ -169,24 +185,14 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open( - os.path.join(self.working_path, "batched_gemm_bf16_lookup.h"), "w" - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - mnk="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(mnk=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "batched_gemm_bf16_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -237,22 +243,11 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[tune_df["cu_num"] == cu_num].reset_index() - for i in range(len(tune_df)): - B = tune_df.loc[i, "B"] - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - kid = tune_df.loc[i, "kernelId"] - tune_dict[(B, M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict_batched( + pd.read_csv(tune_dict_csv), default_kernels_dict, kernels_list + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/ck_deepgemm/deepgemm.cu b/csrc/ck_deepgemm/deepgemm.cu index 4658150779..6d4631417d 100644 --- a/csrc/ck_deepgemm/deepgemm.cu +++ b/csrc/ck_deepgemm/deepgemm.cu @@ -4,6 +4,7 @@ #include "deepgemm_common.cuh" #include "deepgemm_lookup.h" #include "deepgemm_manifest.h" +#include "gemm_dispatch_utils.h" #include #include "py_itfs_common.h" @@ -12,24 +13,9 @@ using RowwiseKernel = std::function< torch::Tensor &, torch::Tensor &, std::optional, std::optional)>; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple &t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - // For certain high priority shapes, we directly use the best kernel rather // than use heuristics. -using RowwiseKernelMap = std::unordered_map< - std::tuple, - RowwiseKernel, - IntTupleHash>; +using RowwiseKernelMap = GemmDispatchMap; template RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) diff --git a/csrc/ck_deepgemm/gen_instances.py b/csrc/ck_deepgemm/gen_instances.py index 07514424ab..5ef6a8b3cd 100644 --- a/csrc/ck_deepgemm/gen_instances.py +++ b/csrc/ck_deepgemm/gen_instances.py @@ -1,12 +1,28 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. import os +import sys from pathlib import Path import pandas as pd import argparse import shutil -import torch -from deepgemm_common import kernelInstance, kernels_list, default_kernels_dict + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from deepgemm_common import ( # noqa: E402 + kernelInstance, + kernels_list, + default_kernels_dict, +) class deepgemm_codegen: @@ -172,21 +188,14 @@ def gen_lookup_dict(self, kernels_dict): } // #endif // USE_ROCM """ - with open(os.path.join(self.working_path, "deepgemm_lookup.h"), "w") as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "deepgemm_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -233,21 +242,11 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[tune_df["cu_num"] == cu_num].reset_index() - for i in range(len(tune_df)): - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - kid = tune_df.loc[i, "kernelId"] - tune_dict[(M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), default_kernels_dict, kernels_list + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/ck_gemm_a4w4_blockscale/README.md b/csrc/ck_gemm_a4w4_blockscale/README.md index f95cbbbba2..8b1e17cbdb 100755 --- a/csrc/ck_gemm_a4w4_blockscale/README.md +++ b/csrc/ck_gemm_a4w4_blockscale/README.md @@ -13,11 +13,11 @@ Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a4w4_blockscale_tune via jit: `GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH=/work/llvm-project/build/bin/ python3 csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py -i aiter/configs/a4w4_blockscale_untuned_gemm.csv -o aiter/configs/a4w4_blockscale_tuned_gemm.csv` You can find the results of the tuning in `aiter/configs/a4w4_blockscale_tuned_gemm.csv`, like this: - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| - |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + |**gfx** |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |---------|----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |gfx942 |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | - `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `gfx` identifies the GPU architecture (e.g. `gfx942`, `gfx950`). `cu_num` is the number of compute units and distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a4w4_blockscale.py` and run it, please wait a few minutes as it will build gemm_a4w4_blockscale tuned kernels in `aiter/configs/a4w4_blockscale_tuned_gemm.csv` via jit: diff --git a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale.cu b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale.cu index 3fc3385871..b28c27e4f0 100755 --- a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale.cu +++ b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale.cu @@ -5,6 +5,7 @@ #include "gemm_a4w4_blockscale_manifest.h" #include "gemm_a4w4_blockscale_lookup.h" #include "gemm_common.h" +#include "gemm_dispatch_utils.h" #include using BlockwiseKernel = std::function< @@ -12,22 +13,7 @@ using BlockwiseKernel = std::function< torch::Tensor &, torch::Tensor &, torch::Tensor &, int)>; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple &t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - -using BlockwiseKernelMap = std::unordered_map< - std::tuple, - BlockwiseKernel, - IntTupleHash>; +using BlockwiseKernelMap = GemmDispatchMap; template BlockwiseKernel blockscale_dispatch(int M, int N, int K) @@ -46,8 +32,11 @@ BlockwiseKernel blockscale_dispatch(int M, int N, int K) static_assert(false, "blockscale_dispatch used with unsupported dtype!"); } }(); + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({M, N, K}); + auto it = lookup.find({gfx, cu_num, M, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { @@ -59,7 +48,7 @@ BlockwiseKernel blockscale_dispatch(int M, int N, int K) padded_m = getPaddedM(M, N, K, 0); // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { @@ -67,7 +56,7 @@ BlockwiseKernel blockscale_dispatch(int M, int N, int K) } // Coarse-grained search padded_m = getPaddedM(M, N, K, 1); - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); if (it != lookup.end()) { return it->second; diff --git a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py index 52076538cd..6cf35f6935 100755 --- a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py +++ b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py @@ -226,14 +226,13 @@ def tune( mp_num = args.mp shape_grouped = args.shape_grouped errRatio = args.errRatio - from aiter.jit.utils.chip_info import get_gfx + from aiter.jit.utils.chip_info import get_gfx_runtime as get_gfx if get_gfx() not in ["gfx950"]: print(f"tuning is not supported in this chip {get_gfx()}") return [] - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = int(device_properties.multi_processor_count) + gfx = self.get_gfx() + cu_num = self.get_cu_num() task = [] tasks_in_data = [] @@ -265,7 +264,7 @@ def tune( else 0 ) for splitK in range(maxsplitK + 1): - info = ((cu_num, M, N, K), kernel_idx, splitK, "") + info = ((gfx, cu_num, M, N, K), kernel_idx, splitK, "") task.append( ( info, @@ -313,7 +312,7 @@ def tune( maxsplitK = 0 for splitK in range(maxsplitK + 1): kernel_name = kernelName[0] - info = ((cu_num, M, N, K), asm_kernels_id, splitK, kernel_name) + info = ((gfx, cu_num, M, N, K), asm_kernels_id, splitK, kernel_name) task.append( ( info, diff --git a/csrc/ck_gemm_a4w4_blockscale/gen_instances.py b/csrc/ck_gemm_a4w4_blockscale/gen_instances.py index 9e9da1346a..ee6171ec74 100755 --- a/csrc/ck_gemm_a4w4_blockscale/gen_instances.py +++ b/csrc/ck_gemm_a4w4_blockscale/gen_instances.py @@ -2,12 +2,24 @@ # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. import argparse import os +import sys import shutil from pathlib import Path import pandas as pd -import torch -from gemm_a4w4_blockscale_common import ( + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from gemm_a4w4_blockscale_common import ( # noqa: E402 default_kernels_dict, kernelInstance, kernels_list, @@ -153,24 +165,14 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open( - os.path.join(self.working_path, "gemm_a4w4_blockscale_lookup.h"), "w" - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "gemm_a4w4_blockscale_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -223,23 +225,11 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[tune_df["cu_num"] == cu_num].reset_index() - for i in range(len(tune_df)): - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - kid = tune_df.loc[i, "kernelId"] - if kid < 0 or kid > len(kernels_list): - continue - tune_dict[(M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), default_kernels_dict, kernels_list + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/ck_gemm_a8w8/README.md b/csrc/ck_gemm_a8w8/README.md index 857d193e99..ddd548a10f 100644 --- a/csrc/ck_gemm_a8w8/README.md +++ b/csrc/ck_gemm_a8w8/README.md @@ -13,11 +13,11 @@ Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_tune via jit: `python3 csrc/ck_gemm_a8w8/gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_gemm.csv -o aiter/configs/a8w8_tuned_gemm.csv` You can find the results of this tuning in `aiter/configs/a8w8_tuned_gemm.csv`, like this: - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| - |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + |**gfx** |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |---------|----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |gfx942 |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | - `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `gfx` identifies the GPU architecture (e.g. `gfx942`, `gfx950`). `cu_num` is the number of compute units and distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8 tuned kernels in `aiter/configs/a8w8_tuned_gemm.csv` via jit: diff --git a/csrc/ck_gemm_a8w8/gemm_a8w8.cu b/csrc/ck_gemm_a8w8/gemm_a8w8.cu index 69eb548023..ac9532e58d 100644 --- a/csrc/ck_gemm_a8w8/gemm_a8w8.cu +++ b/csrc/ck_gemm_a8w8/gemm_a8w8.cu @@ -4,6 +4,7 @@ #include "gemm_a8w8_common.cuh" #include "gemm_a8w8_manifest.h" #include "gemm_a8w8_lookup.h" +#include "gemm_dispatch_utils.h" #include #include "py_itfs_common.h" @@ -13,24 +14,9 @@ using RowwiseKernel = std::function< torch::Tensor &, std::optional, int)>; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple &t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - // For certain high priority shapes, we directly use the best kernel rather // than use heuristics. -using RowwiseKernelMap = std::unordered_map< - std::tuple, - RowwiseKernel, - IntTupleHash>; +using RowwiseKernelMap = GemmDispatchMap; template RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) @@ -112,8 +98,11 @@ RowwiseKernel rowwise_dispatch(int M, int N, int K) return RowwiseKernelMap{GENERATE_LOOKUP_TABLE(ABDataType, DDataType, EDataType)}; }(); + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({M, N, K}); + auto it = lookup.find({gfx, cu_num, M, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { @@ -134,7 +123,7 @@ RowwiseKernel rowwise_dispatch(int M, int N, int K) padded_m = 20480; } // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if (it != lookup.end()) { diff --git a/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py index 85df81ea54..0c7fa1637d 100644 --- a/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py +++ b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py @@ -56,7 +56,17 @@ def get_tuned_gemm_list(tuned_gemm_file): tunedf = pd.read_csv(tuned_gemm_file) else: tunedf = pd.DataFrame( - columns=["cu_num", "M", "N", "K", "kernelId", "splitK", "us", "kernelName"] + columns=[ + "gfx", + "cu_num", + "M", + "N", + "K", + "kernelId", + "splitK", + "us", + "kernelName", + ] ) return tunedf @@ -108,11 +118,11 @@ def getKernelName(self, kernelId): return kernels_list[kernelId].name def _clear_op_caches(self): - from aiter.ops.gemm_op_a8w8 import get_GEMM_config_with_quant_type + from aiter.ops import gemm_op_a8w8 as _op - get_GEMM_config_with_quant_type.cache_clear() - if hasattr(get_GEMM_config_with_quant_type, "file_cache"): - get_GEMM_config_with_quant_type.file_cache.clear() + _op.get_GEMM_config_with_quant_type.cache_clear() + _op._GEMM_QUANT_TYPE_CACHE.clear() + _op._GEMM_QUANT_TYPE_HAS_GFX.clear() def _setup_specific_arguments(self): pass @@ -175,6 +185,7 @@ def tune( shape_grouped = args.shape_grouped errRatio = args.errRatio cu_num = self.get_cu_num() + gfx = self.get_gfx() task = [] tasks_data = [] @@ -191,7 +202,7 @@ def tune( kernels_num = len(kernels_list) total_kernel_nums = 0 - info_keys = (cu_num, M, N, K, q_dtype_w) + info_keys = (gfx, cu_num, M, N, K, q_dtype_w) for j in range(kernels_num): kernel = kernels_list[j] @@ -250,7 +261,7 @@ def tune( if __name__ == "__main__": ## use default key and resultList with q_dtype_w support - key = ["cu_num", "M", "N", "K", "q_dtype_w"] + key = ["gfx", "cu_num", "M", "N", "K", "q_dtype_w"] resultList = [ "kernelId", "splitK", diff --git a/csrc/ck_gemm_a8w8/gen_instances.py b/csrc/ck_gemm_a8w8/gen_instances.py index 46d93f57cf..ab328b4517 100644 --- a/csrc/ck_gemm_a8w8/gen_instances.py +++ b/csrc/ck_gemm_a8w8/gen_instances.py @@ -2,12 +2,28 @@ # Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import argparse import os +import sys import shutil from pathlib import Path import pandas as pd -import torch -from gemm_a8w8_common import default_kernels_dict, kernelInstance, kernels_list + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from gemm_a8w8_common import ( # noqa: E402 + default_kernels_dict, + kernelInstance, + kernels_list, +) class gemm_a8w8_fwd_codegen: @@ -230,22 +246,14 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open(os.path.join(self.working_path, "gemm_a8w8_lookup.h"), "w") as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "gemm_a8w8_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -297,21 +305,11 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[tune_df["cu_num"] == cu_num].reset_index() - for i in range(len(tune_df)): - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - kid = tune_df.loc[i, "kernelId"] - tune_dict[(M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), default_kernels_dict, kernels_list + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/ck_gemm_a8w8_blockscale/README.md b/csrc/ck_gemm_a8w8_blockscale/README.md index a514ee7f43..673d60ac54 100755 --- a/csrc/ck_gemm_a8w8_blockscale/README.md +++ b/csrc/ck_gemm_a8w8_blockscale/README.md @@ -13,11 +13,11 @@ Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_blockscale_tune via jit: `python3 csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py -i aiter/configs/a8w8_blockscale_untuned_gemm.csv -o aiter/configs/a8w8_blockscale_tuned_gemm.csv --libtype both` libtype can be `ck`, `cktile` or `both`. We recommend to tune together by setting `--libtype both` to get both ck legacy and tile implementations, then choose the best one, this will take more time but help to get better performance. You can find the results of the tuning in `aiter/configs/a8w8_blockscale_tuned_gemm.csv`, like this: - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| - |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + |**gfx** |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |---------|----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |gfx942 |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | - `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `gfx` identifies the GPU architecture (e.g. `gfx942`, `gfx950`). `cu_num` is the number of compute units and distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8_blockscale.py` and run it, please wait a few minutes as it will build gemm_a8w8_blockscale tuned kernels in `aiter/configs/a8w8_blockscale_tuned_gemm.csv` via jit: diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu index daecea9ebe..6d99612be2 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu @@ -8,6 +8,7 @@ #include #include "gemm_common.h" +#include "gemm_dispatch_utils.h" #include "gemm_a8w8_blockscale_common.cuh" #include "gemm_a8w8_blockscale_lookup.h" @@ -16,20 +17,7 @@ using BlockwiseKernel = std::function; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple& t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - -using BlockwiseKernelMap = - std::unordered_map, BlockwiseKernel, IntTupleHash>; +using BlockwiseKernelMap = GemmDispatchMap; template static BlockwiseKernel blockscale_dispatch(int M, int N, int K) @@ -53,8 +41,11 @@ static BlockwiseKernel blockscale_dispatch(int M, int N, int K) } }(); + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({M, N, K}); + auto it = lookup.find({gfx, cu_num, M, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -67,7 +58,7 @@ static BlockwiseKernel blockscale_dispatch(int M, int N, int K) padded_m = getPaddedM(M, N, K, 0); // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -76,7 +67,7 @@ static BlockwiseKernel blockscale_dispatch(int M, int N, int K) // Coarse-grained search padded_m = getPaddedM(M, N, K, 1); - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); if(it != lookup.end()) { return it->second; diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu index 1a809a6bf4..d5cdf0d239 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu @@ -8,6 +8,7 @@ #include #include "gemm_common.h" +#include "gemm_dispatch_utils.h" #include "gemm_a8w8_blockscale_cktile_common.cuh" #include "gemm_a8w8_blockscale_cktile_lookup.h" @@ -16,20 +17,7 @@ using BlockwiseKernel = std::function; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple& t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - -using BlockwiseKernelMap = - std::unordered_map, BlockwiseKernel, IntTupleHash>; +using BlockwiseKernelMap = GemmDispatchMap; template static BlockwiseKernel blockscale_dispatch(int M, int N, int K) @@ -53,8 +41,11 @@ static BlockwiseKernel blockscale_dispatch(int M, int N, int K) } }(); + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({M, N, K}); + auto it = lookup.find({gfx, cu_num, M, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -67,7 +58,7 @@ static BlockwiseKernel blockscale_dispatch(int M, int N, int K) padded_m = getPaddedM(M, N, K, 0); // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -76,7 +67,7 @@ static BlockwiseKernel blockscale_dispatch(int M, int N, int K) // Coarse-grained search padded_m = getPaddedM(M, N, K, 1); - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); if(it != lookup.end()) { return it->second; diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py index 8244c36a6c..41ada5c7df 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py @@ -15,7 +15,7 @@ from aiter.utility.base_tuner import GemmCommonTuner from aiter.utility.mp_tuner import mp_tuner from aiter.ops.shuffle import shuffle_weight -from aiter.jit.utils.chip_info import get_gfx +from aiter.jit.utils.chip_info import get_gfx_runtime as get_gfx sys.path.insert(0, str(Path(__file__).parent.parent)) from ck_gemm_a8w8_blockscale_bpreshuffle.gemm_a8w8_blockscale_bpreshuffle_common import ( @@ -183,11 +183,11 @@ def __init__(self, name, keys, resultList, description=""): super().__init__(name, keys, resultList, description) def _clear_op_caches(self): - from aiter.ops.gemm_op_a8w8 import get_GEMM_config_with_quant_type + from aiter.ops import gemm_op_a8w8 as _op - get_GEMM_config_with_quant_type.cache_clear() - if hasattr(get_GEMM_config_with_quant_type, "file_cache"): - get_GEMM_config_with_quant_type.file_cache.clear() + _op.get_CKGEMM_config.cache_clear() + _op._CKGEMM_CONFIG_CACHE.clear() + _op._CKGEMM_HAS_GFX.clear() def _setup_specific_arguments(self): """ @@ -274,7 +274,7 @@ def get_gemm_a8w8_blockscale_cktile_tune_task( block_per_cu, run_kwargs, ): - cu_num, M, N, K = info_keys + gfx, cu_num, M, N, K = info_keys # kernel_list = candidate_kernels_bpreshuffle_cktile_dict if preshuffleB else candidate_kernels_cktile_dict kernel_list = { k: v @@ -341,7 +341,7 @@ def get_gemm_a8w8_blockscale_tune_task( preshuffleB, run_kwargs, ): - cu_num, M, N, K = info_keys + gfx, cu_num, M, N, K = info_keys kernel_list = ( candidate_kernels_bpreshuffle_dict if preshuffleB @@ -453,7 +453,7 @@ def get_gemm_a8w8_blockscale_asm_tune_task( preshuffleB, run_kwargs, ): - cu_num, M, N, K = info_keys + gfx, cu_num, M, N, K = info_keys asm_kernel_list_csv = ( f"{get_asm_dir()}/fp8gemm_blockscale/fp8gemm_bf16_blockscale.csv" ) @@ -526,6 +526,7 @@ def tune( errRatio = args.errRatio block_per_cu = args.blockPerCu cu_num = self.get_cu_num() + gfx = self.get_gfx() run_kwargs = { "num_warmup": args.warmup, "num_iters": args.iters, @@ -539,7 +540,7 @@ def tune( K = untunedf.loc[i, "K"] seed = seed + 1 prev_task_count = len(task) - info_keys = (cu_num, M, N, K) + info_keys = (gfx, cu_num, M, N, K) lib = args.libtype if lib in ("ck", "both", "all"): task.extend( @@ -637,7 +638,7 @@ def result_to_df(self, results): if __name__ == "__main__": - key = ["cu_num", "M", "N", "K"] + key = ["gfx", "cu_num", "M", "N", "K"] resultList = [ "libtype", "kernelId", diff --git a/csrc/ck_gemm_a8w8_blockscale/gen_instances.py b/csrc/ck_gemm_a8w8_blockscale/gen_instances.py index 134d1313bd..c538659f96 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gen_instances.py +++ b/csrc/ck_gemm_a8w8_blockscale/gen_instances.py @@ -2,13 +2,24 @@ # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. import argparse import os +import sys import shutil from pathlib import Path import pandas as pd -import torch -from gemm_a8w8_blockscale_instance import ( +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from gemm_a8w8_blockscale_instance import ( # noqa: E402 default_kernels_dict, KernelInstance, candidate_kernels_dict, @@ -34,33 +45,14 @@ def get_tune_dict(self, tune_dict_csv: str): """ Get tune dict from csv file """ - - tune_dict = default_kernels_dict - if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[ - (tune_df["cu_num"] == cu_num) & (tune_df["libtype"] == "ck") - ].reset_index() - - for i in range(len(tune_df)): - M = int(tune_df.loc[i, "M"]) - N = int(tune_df.loc[i, "N"]) - K = int(tune_df.loc[i, "K"]) - kid = int(tune_df.loc[i, "kernelId"]) - - if kid in candidate_kernels_dict: - tune_dict[(M, N, K)] = candidate_kernels_dict[kid] - else: - print( - f"Warning: kernelId {kid} not found in candidate_kernels_dict for shape ({M}, {N}, {K})" - ) - - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), + default_kernels_dict, + candidate_kernels_dict, + libtype="ck", + ) + return default_kernels_dict def gen_ck_instance(self, k: KernelInstance): """ @@ -288,24 +280,14 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open( - os.path.join(self.working_path, "gemm_a8w8_blockscale_lookup.h"), "w" - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "gemm_a8w8_blockscale_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): """ diff --git a/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py b/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py index f54046a677..29087a2377 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py +++ b/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py @@ -2,13 +2,24 @@ # Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import argparse import os +import sys import shutil from pathlib import Path import pandas as pd -import torch -from gemm_a8w8_blockscale_cktile_instance import ( +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from gemm_a8w8_blockscale_cktile_instance import ( # noqa: E402 default_kernels_cktile_dict, TileKernelInstance, candidate_kernels_cktile_dict, @@ -35,51 +46,15 @@ def get_tune_dict(self, tune_dict_csv: str): """ Get tune dict from csv file """ - - tune_dict = default_kernels_cktile_dict - if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[ - (tune_df["cu_num"] == cu_num) & (tune_df["libtype"] == "cktile") - ].reset_index() - # NOTE: Matching by kernelName (not kernelId). The kernelId column in tuned - # CSVs is kept but it is NOT used for kernel selection anymore. - # This allows instance lists to be reordered or expanded (e.g. changing - # BLOCK_PER_CU_MAX) without invalidating existing tuned CSVs. - use_name = "kernelName" in tune_df.columns - if not use_name: - print( - "[Warning]: tuned CSV has no kernelName column, falling back to kernelId. " - "Re-run tuner to generate a CSV with kernelName for robust matching." - ) - for i in range(len(tune_df)): - M = int(tune_df.loc[i, "M"]) - N = int(tune_df.loc[i, "N"]) - K = int(tune_df.loc[i, "K"]) - - if use_name: - kname = str(tune_df.loc[i, "kernelName"]) - if kname in candidate_kernels_by_name: - tune_dict[(M, N, K)] = candidate_kernels_by_name[kname] - else: - print( - f"Warning: kernelName '{kname}' not found for shape ({M}, {N}, {K})" - ) - else: - kid = int(tune_df.loc[i, "kernelId"]) - if kid in candidate_kernels_cktile_dict: - tune_dict[(M, N, K)] = candidate_kernels_cktile_dict[kid] - else: - print( - f"Warning: kernelId {kid} not found for shape ({M}, {N}, {K})" - ) - - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), + default_kernels_cktile_dict, + candidate_kernels_cktile_dict, + libtype="cktile", + kernels_by_name=candidate_kernels_by_name, + ) + return default_kernels_cktile_dict def gen_cktile_instance(self, k: TileKernelInstance): """ @@ -189,24 +164,14 @@ def gen_lookup_dict(self, kernels_dict: dict): #endif // USE_ROCM """ - with open( - os.path.join(self.working_path, "gemm_a8w8_blockscale_cktile_lookup.h"), "w" - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "gemm_a8w8_blockscale_cktile_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): """ diff --git a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md index 4d4c6c183a..1838c075e3 100755 --- a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md +++ b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md @@ -13,11 +13,11 @@ Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_blockscale_bpreshuffle_tune via jit: `python3 csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py --preshuffle -i aiter/configs/a8w8_blockscale_bpreshuffle_untuned_gemm.csv -o aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` You can find the results of the tuning in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`, like this: - |**cu_num**|**M**|**N**|**K**|**libtype**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| - |----------|-----|-----|-----|-----------|------------|----------|------|--------------|----------|------|------------| - |80 |128 |1536 |7168 |ck |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + |**gfx** |**cu_num**|**M**|**N**|**K**|**libtype**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |---------|----------|-----|-----|-----|-----------|------------|----------|------|--------------|----------|------|------------| + |gfx942 |80 |128 |1536 |7168 |ck |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | - `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `gfx` identifies the GPU architecture (e.g. `gfx942`, `gfx950`). `cu_num` is the number of compute units and distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8_blockscale.py` and run it, please wait a few minutes as it will build gemm_a8w8_blockscale_bpreshuffle tuned kernels in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` via jit: diff --git a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle.cu b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle.cu index 15f751dcea..9f58be7343 100755 --- a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle.cu +++ b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle.cu @@ -5,26 +5,14 @@ #include "gemm_a8w8_blockscale_bpreshuffle_lookup.h" #include "gemm_common.h" #include "gemm_a8w8_blockscale_bpreshuffle_manifest.h" +#include "gemm_dispatch_utils.h" #include using BlockwiseKernel = std::function; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple& t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - -using BlockwiseKernelMap = - std::unordered_map, BlockwiseKernel, IntTupleHash>; +using BlockwiseKernelMap = GemmDispatchMap; // Helper function to return the next largest power of 2 static constexpr int nextPow2(unsigned int num) @@ -56,8 +44,11 @@ BlockwiseKernel blockscale_bpreshuffle_dispatch(int M, int N, int K) } }(); + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({M, N, K}); + auto it = lookup.find({gfx, cu_num, M, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -65,24 +56,24 @@ BlockwiseKernel blockscale_bpreshuffle_dispatch(int M, int N, int K) } int padded_m = M; - + // Fine-grained search padded_m = getPaddedM(M, N, K, 0); // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { return it->second; } - + // Coarse-grained search padded_m = getPaddedM(M, N, K, 1); - it = lookup.find({padded_m, N, K}); - if (it != lookup.end()) + it = lookup.find({gfx, cu_num, padded_m, N, K}); + if(it != lookup.end()) { - return it->second; + return it->second; } // Otherwise, use heuristics. diff --git a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gen_instances.py b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gen_instances.py index 1576048ca1..0f4788a3c2 100755 --- a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gen_instances.py +++ b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gen_instances.py @@ -2,12 +2,24 @@ # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. import argparse import os +import sys import shutil from pathlib import Path import pandas as pd -import torch -from gemm_a8w8_blockscale_bpreshuffle_common import ( + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from gemm_a8w8_blockscale_bpreshuffle_common import ( # noqa: E402 default_kernels_dict, kernelInstance, kernels_list, @@ -159,27 +171,16 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open( + write_lookup_header( os.path.join( self.working_path, "gemm_a8w8_blockscale_bpreshuffle_lookup.h" ), - "w", - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -234,29 +235,11 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[tune_df["cu_num"] == cu_num].reset_index() - if "libtype" in tune_df.columns: - tune_df = tune_df[tune_df["libtype"] == "ck"].reset_index(drop=True) - for i in range(len(tune_df)): - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - kid = tune_df.loc[i, "kernelId"] - if kid not in kernels_list: - print( - f"[Warning]: kernelId {kid} not found in kernels_list " - f"for shape ({M}, {N}, {K}), skip it" - ) - continue - tune_dict[(M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), default_kernels_dict, kernels_list, libtype="ck" + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/README.md b/csrc/ck_gemm_a8w8_bpreshuffle/README.md index eaf19a691d..3f051ec8df 100644 --- a/csrc/ck_gemm_a8w8_bpreshuffle/README.md +++ b/csrc/ck_gemm_a8w8_bpreshuffle/README.md @@ -14,11 +14,11 @@ Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_tune via jit: `python3 csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py -i aiter/configs/a8w8_bpreshuffle_untuned_gemm.csv -o aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` You can find the results of this tuning in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`, like this: - |**cu_num**|**M**|**N**|**K**|**q_dtype_w** |**libtype**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| - |----------|-----|-----|-----|---------------------|-----------|------------|----------|------|--------------|----------|------|------------| - |80 |128 |1536 |7168 |torch.float8_e4m3fnuz| ck | 23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + |**gfx** |**cu_num**|**M**|**N**|**K**|**q_dtype_w** |**libtype**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |---------|----------|-----|-----|-----|---------------------|-----------|------------|----------|------|--------------|----------|------|------------| + |gfx942 |80 |128 |1536 |7168 |torch.float8_e4m3fnuz| ck | 23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | - `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `gfx` identifies the GPU architecture (e.g. `gfx942`, `gfx950`). `cu_num` is the number of compute units and distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). `q_dtype_w` means the quantization data type of weight, and it is used to distinguish between different quantization data types. support torch.int8 and fp8 4. Build tuned kernels and test: diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu index ac38668d55..54e72cbc41 100755 --- a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu +++ b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu @@ -5,24 +5,13 @@ #include "gemm_a8w8_bpreshuffle_lookup.h" #include "gemm_a8w8_bpreshuffle_manifest.h" #include "gemm_common.h" +#include "gemm_dispatch_utils.h" #include using RowwiseKernel = std::function; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple& t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - -using RowwiseKernelMap = std::unordered_map, RowwiseKernel, IntTupleHash>; +using RowwiseKernelMap = GemmDispatchMap; template RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) @@ -141,8 +130,11 @@ RowwiseKernel rowwise_dispatch(int M, int N, int K) } }(); + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({M, N, K}); + auto it = lookup.find({gfx, cu_num, M, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -150,21 +142,21 @@ RowwiseKernel rowwise_dispatch(int M, int N, int K) } int padded_m = M; - + // Fine-grained search padded_m = getPaddedM(M, N, K, 0); // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { return it->second; } - + // Coarse-grained search padded_m = getPaddedM(M, N, K, 1); // Third check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py index 2b04308a83..a0c5be4aa1 100755 --- a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py +++ b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py @@ -196,11 +196,11 @@ class GemmA8W8BpreShuffleTuner(GemmCommonTuner): } def _clear_op_caches(self): - from aiter.ops.gemm_op_a8w8 import get_GEMM_config_with_quant_type + from aiter.ops import gemm_op_a8w8 as _op - get_GEMM_config_with_quant_type.cache_clear() - if hasattr(get_GEMM_config_with_quant_type, "file_cache"): - get_GEMM_config_with_quant_type.file_cache.clear() + _op.get_GEMM_config_with_quant_type.cache_clear() + _op._GEMM_QUANT_TYPE_CACHE.clear() + _op._GEMM_QUANT_TYPE_HAS_GFX.clear() def _setup_specific_arguments(self): self.parser.add_argument( @@ -261,7 +261,7 @@ def get_asm_kernels(self, file): def get_asm_gemm_i8_tasks(self, info_keys, useSplitK, kernel_id_start, seed=0): task = [] - cu_num, M, N, K, q_dtype_w = info_keys + gfx, cu_num, M, N, K, q_dtype_w = info_keys if eval(q_dtype_w) != dtypes.i8: return task asm_kernel_list_csv = f"{get_asm_dir()}/i8gemm/i8gemm_bf16_perTokenI8.csv" @@ -321,7 +321,7 @@ def get_cktile_gemm_a8w8_bpreshuffle_tune_task( useSplitK, seed, ): - cu_num, M, N, K, q_dtype_w = info_keys + gfx, cu_num, M, N, K, q_dtype_w = info_keys if eval(q_dtype_w) != dtypes.fp8: print( f"Warning: q_dtype_w only support {dtypes.fp8}, actual q_dtype_w is {q_dtype_w}!" @@ -384,7 +384,7 @@ def get_ck_gemm_a8w8_bpreshuffle_tune_task( useSplitK, seed, ): - cu_num, M, N, K, q_dtype_w = info_keys + gfx, cu_num, M, N, K, q_dtype_w = info_keys if eval(q_dtype_w) != dtypes.fp8: print( f"Warning: q_dtype_w only support {dtypes.fp8}, actual q_dtype_w is {q_dtype_w}!" @@ -522,6 +522,7 @@ def tune( shape_grouped = args.shape_grouped errRatio = args.errRatio cu_num = self.get_cu_num() + gfx = self.get_gfx() task = [] tasks_data = [] # [(kernel_nums, datas)] seed = 10000 @@ -532,7 +533,7 @@ def tune( q_dtype_w = untunedf.loc[i, "q_dtype_w"] seed = seed + 1 prev_task_count = len(task) - info_keys = (cu_num, M, N, K, q_dtype_w) + info_keys = (gfx, cu_num, M, N, K, q_dtype_w) if "all" in args.libtype or "ck" in args.libtype: task.extend( self.get_ck_gemm_a8w8_bpreshuffle_tune_task( @@ -670,7 +671,7 @@ def run_config(self, args): if __name__ == "__main__": ## use default key and resultList - key = ["cu_num", "M", "N", "K", "q_dtype_w"] + key = ["gfx", "cu_num", "M", "N", "K", "q_dtype_w"] resultList = [ "libtype", "kernelId", diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/gen_instances.py b/csrc/ck_gemm_a8w8_bpreshuffle/gen_instances.py index 874a0a513d..20f1121f6b 100755 --- a/csrc/ck_gemm_a8w8_bpreshuffle/gen_instances.py +++ b/csrc/ck_gemm_a8w8_bpreshuffle/gen_instances.py @@ -7,9 +7,19 @@ from pathlib import Path import pandas as pd -import torch -from gemm_a8w8_bpreshuffle_common import ( +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from gemm_a8w8_bpreshuffle_common import ( # noqa: E402 default_kernels_dict, kernelInstance, kernels_list, @@ -162,24 +172,14 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open( - os.path.join(self.working_path, "gemm_a8w8_bpreshuffle_lookup.h"), "w" - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + write_lookup_header( + os.path.join(self.working_path, "gemm_a8w8_bpreshuffle_lookup.h"), + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -232,25 +232,11 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[(tune_df["cu_num"] == cu_num)].reset_index() - tune_df = tune_df[tune_df["libtype"] == "ck"].reset_index() - for i in range(len(tune_df)): - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - kid = tune_df.loc[i, "kernelId"] - if kid < 0 or kid >= len(kernels_list): - print(f"[Warning]: kernelId {kid} is out of range, skip it") - continue - tune_dict[(M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), default_kernels_dict, kernels_list, libtype="ck" + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/README.md b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md index b63db71a84..846dcfe7c3 100644 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/README.md +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/README.md @@ -8,7 +8,7 @@ `FLATMM_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py -i aiter/configs/a8w8_bpreshuffle_untuned_gemm.csv -o aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv --libtype cktile` If you want to use split K kernels, you can add the `-k` parameter at the end, notice that should change `bias` to `bias/(2^k)`. This will tune both ck and cktile implementations, if you want to tune cktile only, you can add the `--libtype cktile` parameter at the end. -You can find the results of the tuning in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`. +You can find the results of the tuning in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`. The output CSV includes a `gfx` column (e.g. `gfx942`, `gfx950`) as the first column, identifying the GPU architecture. `cu_num` distinguishes partitioned or binned variants of the same architecture (e.g. MI308X vs MI300X both use `gfx942`). 3. Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_cktile kernels in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` via jit: `FLATMM_HIP_CLANG_PATH=/data/llvm-project/build/bin/ python3 op_tests/test_gemm_a8w8.py` diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu index 6f43912a97..6b04a267fd 100755 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_cktile.cu @@ -5,24 +5,13 @@ #include "gemm_a8w8_bpreshuffle_cktile_lookup.h" #include "gemm_a8w8_bpreshuffle_cktile_manifest.h" #include "gemm_common.h" +#include "gemm_dispatch_utils.h" #include using RowwiseKernel = std::function; -// Define a custom hash function for std::tuple -struct IntTupleHash -{ - size_t operator()(const std::tuple& t) const - { - auto hash1 = std::hash{}(std::get<0>(t)); - auto hash2 = std::hash{}(std::get<1>(t)); - auto hash3 = std::hash{}(std::get<2>(t)); - return hash1 ^ hash2 ^ hash3; - } -}; - -using RowwiseKernelMap = std::unordered_map, RowwiseKernel, IntTupleHash>; +using RowwiseKernelMap = GemmDispatchMap; template RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) @@ -62,8 +51,11 @@ RowwiseKernel rowwise_dispatch(int M, int N, int K) } }(); + const int cu_num = get_device_cu_num(); + const std::string& gfx = get_device_gfx(); + // First check if this shape(M,N,K) is available in the direct lookup. - auto it = lookup.find({M, N, K}); + auto it = lookup.find({gfx, cu_num, M, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -75,7 +67,7 @@ RowwiseKernel rowwise_dispatch(int M, int N, int K) // Fine-grained search padded_m = getPaddedM(M, N, K, 0); // Second check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { @@ -85,7 +77,7 @@ RowwiseKernel rowwise_dispatch(int M, int N, int K) // Coarse-grained search padded_m = getPaddedM(M, N, K, 1); // Third check if this shape(padded_m,N,K) is available in the direct lookup. - it = lookup.find({padded_m, N, K}); + it = lookup.find({gfx, cu_num, padded_m, N, K}); // If we found an optimal kernel, use it. if(it != lookup.end()) { diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py b/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py index a4ab0001c6..334607ff52 100755 --- a/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py +++ b/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py @@ -2,14 +2,23 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. import os import sys -from dataclasses import dataclass -import copy from pathlib import Path import pandas as pd import argparse import shutil -import torch -from gemm_a8w8_bpreshuffle_cktile_common import ( + +this_dir = os.path.dirname(os.path.abspath(__file__)) +AITER_CORE_DIR = ( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter/jit/utils") + if os.path.exists( + os.path.join(os.path.abspath(f"{this_dir}/../../../"), "aiter_meta") + ) + else os.path.abspath(f"{this_dir}/../../aiter/jit/utils") +) +sys.path.insert(0, AITER_CORE_DIR) +from chip_info import build_tune_dict, write_lookup_header # noqa: E402 + +from gemm_a8w8_bpreshuffle_cktile_common import ( # noqa: E402 kernelInstance, kernels_list, default_kernels_dict, @@ -157,25 +166,14 @@ def gen_lookup_dict(self, kernels_dict): #endif // USE_ROCM """ - with open( + write_lookup_header( os.path.join(self.working_path, "gemm_a8w8_bpreshuffle_cktile_lookup.h"), - "w", - ) as f: - f.write(LOOKUP_head) - for mnk, k in kernels_dict.items(): - # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) - if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): - f.write( - LOOKUP_template.format( - MNK="{" - + (", ").join(map(lambda x: str(x), list(mnk))) - + "}", - kernel_name=k.name, - ) - ) - elif self.istune and isinstance(mnk, int): - f.write(LOOKUP_template.format(MNK=mnk, kernel_name=k.name)) - f.write(LOOKUP_end) + kernels_dict, + LOOKUP_head, + LOOKUP_template, + LOOKUP_end, + self.istune, + ) def gen_manifest_head(self, kernels_dict): MAINFEST_head = """#pragma once @@ -229,41 +227,15 @@ def gen_instances(self, kernels_dict): def get_tune_dict(tune_dict_csv): - tune_dict = default_kernels_dict if os.path.exists(tune_dict_csv): - tune_df = pd.read_csv(tune_dict_csv) - if torch.cuda.is_available(): - gpu = torch.cuda.current_device() - device_properties = torch.cuda.get_device_properties(gpu) - cu_num = device_properties.multi_processor_count - tune_df = tune_df[(tune_df["cu_num"] == cu_num)].reset_index() - tune_df = tune_df[tune_df["libtype"] == "cktile"].reset_index() - # NOTE: Matching by kernelName (not kernelId). The kernelId column in tuned - # CSVs is kept but it is NOT used for kernel selection anymore. - # This allows instance lists to be reordered or expanded (e.g. changing - # BLOCK_PER_CU_MAX) without invalidating existing tuned CSVs. - use_name = "kernelName" in tune_df.columns - if not use_name: - print( - "[Warning]: tuned CSV has no kernelName column, falling back to kernelId. " - ) - for i in range(len(tune_df)): - M = tune_df.loc[i, "M"] - N = tune_df.loc[i, "N"] - K = tune_df.loc[i, "K"] - if use_name: - kname = str(tune_df.loc[i, "kernelName"]) - if kname in kernels_by_name: - tune_dict[(M, N, K)] = kernels_by_name[kname] - else: - print(f"[Warning]: kernelName '{kname}' not found, skip it") - else: - kid = tune_df.loc[i, "kernelId"] - if kid < 0 or kid >= len(kernels_list): - print(f"[Warning]: kernelId {kid} is out of range, skip it") - continue - tune_dict[(M, N, K)] = kernels_list[kid] - return tune_dict + return build_tune_dict( + pd.read_csv(tune_dict_csv), + default_kernels_dict, + kernels_list, + libtype="cktile", + kernels_by_name=kernels_by_name, + ) + return default_kernels_dict if __name__ == "__main__": diff --git a/csrc/include/gemm_dispatch_utils.h b/csrc/include/gemm_dispatch_utils.h new file mode 100644 index 0000000000..f5289844a5 --- /dev/null +++ b/csrc/include/gemm_dispatch_utils.h @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: MIT +// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +#ifdef USE_ROCM + +#include "aiter_hip_common.h" +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// GemmDispatchHash +// +// Hash for the (gfx, cu_num, M, N, K) 5-tuple used as the C++ runtime +// dispatch key in all CK GEMM modules. The gfx arch string (e.g. "gfx942") +// is included so that multi-arch .so files containing kernels for two +// architectures that share the same cu_num do not collide. Uses boost-style +// mixing with the golden-ratio constant (0x9e3779b9) for a non-commutative, +// low-collision hash. +// --------------------------------------------------------------------------- +struct GemmDispatchHash +{ + size_t operator()(const std::tuple& t) const + { + size_t h = std::hash{}(std::get<0>(t)); + h ^= std::hash{}(std::get<1>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<2>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<3>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<4>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; + } +}; + +// --------------------------------------------------------------------------- +// get_device_cu_num +// +// Returns the multiProcessorCount of the current HIP device. Cached per +// device ID via SynchronizedCache so that processes calling hipSetDevice() +// across GPUs with different CU counts always get the correct value. +// --------------------------------------------------------------------------- +inline int get_device_cu_num() +{ + static SynchronizedCache cache; + int device = -1; + HIP_CALL(hipGetDevice(&device)); + return cache.get_or_create(device, [device]() { + hipDeviceProp_t prop{}; + HIP_CALL(hipGetDeviceProperties(&prop, device)); + return prop.multiProcessorCount; + }); +} + +// --------------------------------------------------------------------------- +// get_device_gfx +// +// Returns the GCN arch name of the current HIP device (e.g. "gfx942"). +// Cached per device ID via SynchronizedCache so that processes calling +// hipSetDevice() across GPUs of different architectures always get the +// correct arch string. Strips any :sramecc+:xnack- suffix from gcnArchName. +// --------------------------------------------------------------------------- +inline const std::string& get_device_gfx() +{ + static SynchronizedCache cache; + int device = -1; + HIP_CALL(hipGetDevice(&device)); + return cache.get_or_create(device, [device]() { + hipDeviceProp_t prop{}; + HIP_CALL(hipGetDeviceProperties(&prop, device)); + std::string arch_full = prop.gcnArchName; + size_t colon_pos = arch_full.find(':'); + return colon_pos != std::string::npos ? arch_full.substr(0, colon_pos) : arch_full; + }); +} + +// --------------------------------------------------------------------------- +// GemmDispatchMap +// +// Convenience alias for the (gfx, cu_num, M, N, K)-keyed dispatch map type. +// Each module instantiates this with its own RowwiseKernel / BlockwiseKernel +// function type: +// +// using RowwiseKernelMap = GemmDispatchMap; +// --------------------------------------------------------------------------- +template +using GemmDispatchMap = + std::unordered_map, KernelFn, GemmDispatchHash>; + +// --------------------------------------------------------------------------- +// BatchedGemmDispatchHash +// +// Hash for the (gfx, cu_num, B, M, N, K) 6-tuple used as the C++ runtime +// dispatch key in batched CK GEMM modules. Same boost-style mixing as +// GemmDispatchHash. +// --------------------------------------------------------------------------- +struct BatchedGemmDispatchHash +{ + size_t operator()(const std::tuple& t) const + { + size_t h = std::hash{}(std::get<0>(t)); + h ^= std::hash{}(std::get<1>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<2>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<3>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<4>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<5>(t)) + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; + } +}; + +// --------------------------------------------------------------------------- +// BatchedGemmDispatchMap +// +// Convenience alias for the (gfx, cu_num, B, M, N, K)-keyed dispatch map type. +// Used by batched GEMM modules: +// +// using BatchedRowwiseKernelMap = BatchedGemmDispatchMap; +// --------------------------------------------------------------------------- +template +using BatchedGemmDispatchMap = + std::unordered_map, + KernelFn, + BatchedGemmDispatchHash>; + +#endif // USE_ROCM diff --git a/gradlib/gradlib/GemmTuner.py b/gradlib/gradlib/GemmTuner.py index 160695ee65..681f07d72a 100644 --- a/gradlib/gradlib/GemmTuner.py +++ b/gradlib/gradlib/GemmTuner.py @@ -870,6 +870,7 @@ def _setup_specific_arguments(self): def __init__( self, key=[ + "gfx", "cu_num", "M", "N", @@ -901,6 +902,7 @@ def __init__( self.hipb_prefer_ratio = 0.995 self.cu_num = self.get_cu_num() + self.gfx = self.get_gfx() self.gemmobj = None self.num_warmup = 10 @@ -987,7 +989,7 @@ def calculate_perf( info, time, err_ratio = results if time <= 0: return -1, -1 - cu_num, m, n, k = info + gfx, cu_num, m, n, k = info flops = m * n * k * 2 tflops = round(flops / (time * 1000000), 2) @@ -1035,6 +1037,7 @@ def pre_process(self, args): bpreshuffle=ds["bpreshuffle"], ) self.tunedf = self.get_tuned_gemm_list(self.get_out_file(args.tune_file)) + self.untunedf["gfx"] = self.get_gfx() self.untunedf["cu_num"] = self.get_cu_num() self.untunedf = self.untunedf[self.keys] untunedf_cols = self.untunedf.columns @@ -1066,7 +1069,8 @@ def add_gemm( print(self.tunedf) if self.tunedf is None or ( self.tunedf[ - (self.tunedf["cu_num"] == self.cu_num) + (self.tunedf["gfx"] == self.gfx) + & (self.tunedf["cu_num"] == self.cu_num) & (self.tunedf["M"] == m) & (self.tunedf["N"] == n) & (self.tunedf["K"] == k) @@ -1077,6 +1081,7 @@ def add_gemm( ].empty ): entry = { + "gfx": [self.gfx], "cu_num": [self.cu_num], "M": [m], "N": [n], @@ -1103,7 +1108,9 @@ def tune(self, untunedf, tunedf, args): indtype = ds["dtype"] outdtype = ds["outdtype"] outdtype = outdtype if outdtype is not None else indtype - self.set_run_iters((self.cu_num, ds["M"], ds["N"], ds["K"]), eval(indtype)) + self.set_run_iters( + (self.gfx, self.cu_num, ds["M"], ds["N"], ds["K"]), eval(indtype) + ) gemmobj = Gemm( ds["M"], @@ -1139,6 +1146,7 @@ def processResult(self, rets, fast_mode): splitK = info[2] kernelName = info[4] libtype = info[3] + res_one.append(get_gfx()) res_one.append(get_cu_num()) for ele in info[0]: res_one.append(ele) @@ -1151,7 +1159,7 @@ def processResult(self, rets, fast_mode): res_one.append(kernelName) res_one.append(err_ratio) ret = ( - (self.cu_num, info[0][0], info[0][1], info[0][2]), + (self.gfx, self.cu_num, info[0][0], info[0][1], info[0][2]), us, err_ratio, ) @@ -1246,7 +1254,7 @@ def save_profile(self, timedf, profile_file): resultsdf.to_csv(profile_file, index=False) def set_run_iters(self, input, inputdtype): - cu_num, m, n, k, *rest = input + gfx, cu_num, m, n, k, *rest = input flops = m * n * k * 2 # bpe = self.get_bpe(inputdtype) if flops < 128 * 5120 * 256 * 2: diff --git a/op_tests/configs/gemm_codegen_gfx_filter.csv b/op_tests/configs/gemm_codegen_gfx_filter.csv new file mode 100644 index 0000000000..47eaef43f2 --- /dev/null +++ b/op_tests/configs/gemm_codegen_gfx_filter.csv @@ -0,0 +1,10 @@ +gfx,cu_num,M,N,K,q_dtype_w,kernelId,splitK,us,kernelName,tflops,bw,errRatio,libtype +gfx942,80,1,8192,1024,torch.int8,78,0,9.1327,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,1.84,920.43,0.0, +gfx942,304,1,8192,1024,torch.int8,78,0,9.1327,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2,1.84,1021.33,0.0, +gfx950,256,1,8192,1024,torch.int8,79,0,5.7218,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,2.93,1469.12,0.0, +gfx942,80,32,8192,1024,torch.int8,28,0,9.9382,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,54.02,900.13,0.0, +gfx942,304,32,8192,1024,torch.int8,28,0,9.9382,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3,54.02,900.13,0.0, +gfx950,256,32,8192,1024,torch.int8,79,0,5.6281,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2,95.39,1589.46,0.0, +gfx942,80,128,1280,8192,torch.int8,30,0,23.3376,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,115.02,508.28,0.0, +gfx942,304,128,1280,8192,torch.int8,30,0,23.3376,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3,115.02,508.28,0.0, +gfx950,256,128,1280,8192,torch.int8,34,0,13.9827,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3,191.98,848.34,0.0, diff --git a/op_tests/configs/gemm_codegen_gfx_filter_bpreshuffle.csv b/op_tests/configs/gemm_codegen_gfx_filter_bpreshuffle.csv new file mode 100644 index 0000000000..1120c83ba2 --- /dev/null +++ b/op_tests/configs/gemm_codegen_gfx_filter_bpreshuffle.csv @@ -0,0 +1,7 @@ +gfx,cu_num,M,N,K,q_dtype_w,kernelId,splitK,us,kernelName,tflops,bw,errRatio,libtype +gfx942,304,64,5120,1280,torch.int8,0,1,7.7424,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,108.35,941.68,0.0,asm +gfx950,256,64,5120,1280,torch.float8_e4m3fn,29,0,7.6564,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,109.56,952.26,0.0,ck +gfx942,304,128,5120,1280,torch.int8,1,1,8.5797,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,195.55,935.72,0.0,asm +gfx950,256,128,5120,1280,torch.float8_e4m3fn,29,0,7.7817,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,215.60,1031.67,0.0,ck +gfx942,304,256,5120,1280,torch.int8,2,1,9.2104,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,364.31,1031.74,0.0,asm +gfx950,256,256,5120,1280,torch.float8_e4m3fn,119,0,10.1726,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,329.85,934.15,0.0,ck diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py index 9bf058a4c8..9b8cc10e74 100755 --- a/op_tests/test_gemm_a16w16.py +++ b/op_tests/test_gemm_a16w16.py @@ -11,7 +11,7 @@ import aiter from aiter import dtypes, hipb_create_extension, hipb_mm -from aiter.jit.utils.chip_info import get_gfx +from aiter.jit.utils.chip_info import get_gfx_runtime as get_gfx from aiter.ops.shuffle import shuffle_weight from aiter.test_common import benchmark, checkAllclose, perftest from aiter.tuned_gemm import tgemm, triton_gemm diff --git a/op_tests/test_gemm_a4w4.py b/op_tests/test_gemm_a4w4.py index aeaca56d35..1b42916f28 100644 --- a/op_tests/test_gemm_a4w4.py +++ b/op_tests/test_gemm_a4w4.py @@ -87,7 +87,7 @@ def run_gemm_asm( @benchmark() def test_gemm(dtype, M, N, K): - from aiter.jit.utils.chip_info import get_gfx + from aiter.jit.utils.chip_info import get_gfx_runtime as get_gfx if get_gfx() not in ["gfx950"]: return diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py index fe070ef169..fa7682c9a5 100755 --- a/op_tests/test_gemm_a8w8.py +++ b/op_tests/test_gemm_a8w8.py @@ -10,7 +10,7 @@ from aiter.ops.shuffle import shuffle_weight from aiter.test_common import checkAllclose, perftest, benchmark from aiter import hipb_mm, hipb_create_extension -from aiter.jit.utils.chip_info import get_gfx, get_cu_num +from aiter.jit.utils.chip_info import get_gfx_runtime as get_gfx, get_cu_num import pandas as pd import argparse from functools import lru_cache @@ -37,11 +37,13 @@ def is_shape_tuned( if os.path.exists(tuned_file): try: df = pd.read_csv(tuned_file) + gfx = get_gfx() cu_num = get_cu_num() + mask = df["cu_num"] == cu_num + if "gfx" in df.columns: + mask = mask & (df["gfx"] == gfx) _TUNED_SHAPES_CACHE[tuned_file] = set( - df[df["cu_num"] == cu_num][["M", "N", "K", "q_dtype_w"]].apply( - tuple, axis=1 - ) + df[mask][["M", "N", "K", "q_dtype_w"]].apply(tuple, axis=1) ) except Exception as e: print(f"Warning: Could not load tuned shapes: {e}") @@ -111,7 +113,7 @@ def init_hipblas(): @benchmark() -def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8, pad_a=128): +def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8, pad_a=128, skip_ck=False): x = torch.randn((m, k), dtype=dtype, device="cuda") weight = torch.randn((n, k), dtype=dtype, device="cuda") x, x_scale = aiter.pertoken_quant(x, quant_dtype=quantDtype) @@ -139,21 +141,26 @@ def test_gemm(dtype, m, n, k, quantDtype=dtypes.i8, pad_a=128): # print(f"{x_pad.shape=}{x_pad.stride()}") a, avg_a = run_torch(x, weight, x_scale, w_scale, bias, dtype) - b, avg_b = run_gemm_ck(x, weight, x_scale, w_scale, bias, dtype) - - shape_is_tuned = (quantDtype == dtypes.fp8) and is_shape_tuned(m, n, k, quantDtype) - if shape_is_tuned: - err_b = checkAllclose( - a, - b, - msg="ck (tuned): ", - rtol=1e-1, - atol=1e-1, - tol_err_ratio=1.0, - printLog=False, - ) + # skip_ck bypasses gemm_a8w8_CK (module_gemm_a8w8) only; run_gemm_ck_bpreshuffle is unaffected (gated by quantDtype below) + if skip_ck: + avg_b = err_b = None else: - err_b = checkAllclose(a, b, msg="ck: ", rtol=1e-2, atol=1e-2) + b, avg_b = run_gemm_ck(x, weight, x_scale, w_scale, bias, dtype) + shape_is_tuned = (quantDtype == dtypes.fp8) and is_shape_tuned( + m, n, k, quantDtype + ) + if shape_is_tuned: + err_b = checkAllclose( + a, + b, + msg="ck (tuned): ", + rtol=1e-1, + atol=1e-1, + tol_err_ratio=1.0, + printLog=False, + ) + else: + err_b = checkAllclose(a, b, msg="ck: ", rtol=1e-2, atol=1e-2) if quantDtype != dtypes.i8: c, avg_c = run_gemm_ck_bpreshuffle(x, weightshuffle, x_scale, w_scale, dtype) # c = c + bias @@ -326,16 +333,21 @@ def calculate_total_valid_points(cu_count, aligned_k): return total -def test_normal_gemm_a8w8_pertoken_quant(l_dtype, l_quantDtype, l_mnk, pad_a=128): +def test_normal_gemm_a8w8_pertoken_quant( + l_dtype, l_quantDtype, l_mnk, pad_a=128, skip_ck=False +): df = [] for dtype in l_dtype: for quantDtype in l_quantDtype: for m, n, k in l_mnk: - ret = test_gemm(dtype, m, n, k, quantDtype, pad_a=pad_a) + ret = test_gemm( + dtype, m, n, k, quantDtype, pad_a=pad_a, skip_ck=skip_ck + ) df.append(ret) df = pd.DataFrame(df) df_md = df.to_markdown(index=False) aiter.logger.info("gemm_a8w8 summary (markdown):\n%s", df_md) + return df def test_skinny_gemm_a8w8_pertoken_quant(): @@ -472,7 +484,81 @@ def test_skinny_gemm_a8w8_pertoken_quant(): e.g. -mnk 1280,8192,1024""", ) +parser.add_argument( + "--csv", + type=str, + default=None, + help="""CSV file containing M, N, K columns (one shape per row). + e.g.: --csv shapes.csv""", +) +parser.add_argument( + "--bpreshuffle-csv", + type=str, + default=None, + dest="bpreshuffle_csv", + help="""CSV file for bpreshuffle-path shapes (skips gemm_a8w8_CK, runs ASM directly). + e.g.: --bpreshuffle-csv op_tests/configs/gemm_codegen_gfx_filter_bpreshuffle.csv""", +) +parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help="""Directory to save results CSV. + e.g.: -o results/""", +) +parser.add_argument( + "--suffix", + type=str, + default="results", + help="""Suffix for output CSV filename. + e.g.: --suffix branch""", +) + args = parser.parse_args() -test_normal_gemm_a8w8_pertoken_quant(args.dtype, args.quantDtype, args.mnk, args.pad_a) +if args.csv is not None: + if not os.path.exists(args.csv): + raise FileNotFoundError(f"CSV file not found: {args.csv}") + shapes_df = pd.read_csv(args.csv) + print(f"Loaded {len(shapes_df)} shapes from {args.csv}", flush=True) + args.mnk = list( + zip(shapes_df["M"].tolist(), shapes_df["N"].tolist(), shapes_df["K"].tolist()) + ) + +df = test_normal_gemm_a8w8_pertoken_quant( + args.dtype, args.quantDtype, args.mnk, args.pad_a +) test_skinny_gemm_a8w8_pertoken_quant() + +if args.output and df is not None: + os.makedirs(args.output, exist_ok=True) + if args.csv: + csv_filename = os.path.basename(args.csv).replace(".csv", f"_{args.suffix}.csv") + else: + csv_filename = f"gemm_a8w8_{args.suffix}.csv" + out_path = os.path.join(args.output, csv_filename) + df.to_csv(out_path, index=False) + print(f"Saved results to: {out_path}") + +if args.bpreshuffle_csv is not None: + if not os.path.exists(args.bpreshuffle_csv): + raise FileNotFoundError(f"bpreshuffle CSV not found: {args.bpreshuffle_csv}") + bpre_df = pd.read_csv(args.bpreshuffle_csv) + print( + f"Loaded {len(bpre_df)} bpreshuffle shapes from {args.bpreshuffle_csv}", + flush=True, + ) + bpre_mnk = list( + zip(bpre_df["M"].tolist(), bpre_df["N"].tolist(), bpre_df["K"].tolist()) + ) + df_bpre = test_normal_gemm_a8w8_pertoken_quant( + args.dtype, args.quantDtype, bpre_mnk, args.pad_a, skip_ck=True + ) + if args.output and df_bpre is not None: + bpre_filename = os.path.basename(args.bpreshuffle_csv).replace( + ".csv", f"_{args.suffix}.csv" + ) + bpre_out = os.path.join(args.output, bpre_filename) + df_bpre.to_csv(bpre_out, index=False) + print(f"Saved bpreshuffle results to: {bpre_out}") diff --git a/op_tests/test_gemm_a8w8_blockscale.py b/op_tests/test_gemm_a8w8_blockscale.py index 6bcb908cbd..54d6fd02b2 100755 --- a/op_tests/test_gemm_a8w8_blockscale.py +++ b/op_tests/test_gemm_a8w8_blockscale.py @@ -19,9 +19,10 @@ from einops import repeat as eirp block_shape = (128, 128) +TEST_NUM_ITERS = 100 -@perftest(num_iters=5) +@perftest(num_iters=TEST_NUM_ITERS) def run_torch(x, weight, x_scale, w_scale, dtype=dtypes.bf16): block_shape_n, block_shape_k = block_shape m, k = x.shape @@ -46,12 +47,12 @@ def run_torch(x, weight, x_scale, w_scale, dtype=dtypes.bf16): return out.to(dtype) -@perftest() +@perftest(num_iters=TEST_NUM_ITERS) def run_gemm(x, weight, x_scale, w_scale, dtype=dtypes.bf16): return aiter.gemm_a8w8_blockscale(x, weight, x_scale, w_scale, dtype) -@perftest() +@perftest(num_iters=TEST_NUM_ITERS) def run_gemm_bpreshuffle(x, weightshuffle, x_scale, w_scale, dtype=dtypes.bf16): return aiter.gemm_a8w8_blockscale_bpreshuffle( x, weightshuffle, x_scale, w_scale, dtype @@ -98,7 +99,7 @@ def test_gemm(dtype, m, n, k, ck_preshuffle=True): return ret -@perftest(num_iters=5) +@perftest(num_iters=TEST_NUM_ITERS) def run_torch2(x, weight, x_scale, w_scale, dtype=dtypes.bf16): block_shape_n, block_shape_k = block_shape m, k = x.shape @@ -118,8 +119,8 @@ def run_torch2(x, weight, x_scale, w_scale, dtype=dtypes.bf16): return out.to(dtype) -@perftest() -def run_asm(x, weight, x_scale, w_scale, dtype=dtypes.bf16): +@perftest(num_iters=TEST_NUM_ITERS) +def run_asm(x, weight, x_scale, w_scale, dtype=dtypes.bf16, kernel_name=None): m, k = x.shape n, _ = weight.shape out = torch.empty((m, n), dtype=dtype, device=x.device) @@ -229,17 +230,60 @@ def run_asm(x, weight, x_scale, w_scale, dtype=dtypes.bf16): or --ck_preshuffle False """, ) +parser.add_argument( + "--csv", + type=str, + default=None, + help="""CSV file containing M, N, K columns (one shape per row). + e.g.: --csv shapes.csv""", +) +parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help="""Directory to save results CSV. + e.g.: -o results/""", +) +parser.add_argument( + "--suffix", + type=str, + default="results", + help="""Suffix for output CSV filename. + e.g.: --suffix branch""", +) args = parser.parse_args() +l_preshuffle = ( + args.ck_preshuffle if isinstance(args.ck_preshuffle, list) else [args.ck_preshuffle] +) + df = [] -for dtype in args.dtype: - # deepseek-r1 - for m in args.m: - for n, k in args.nk: - for ck_p in args.ck_preshuffle: - ret = test_gemm(dtype, m, n, k, ck_preshuffle=ck_p) +if args.csv is not None: + if not os.path.exists(args.csv): + raise FileNotFoundError(f"CSV file not found: {args.csv}") + shapes_df = pd.read_csv(args.csv) + print(f"Loaded {len(shapes_df)} shapes from {args.csv}", flush=True) + for dtype in args.dtype: + for preshuffle in l_preshuffle: + for _, row in shapes_df.iterrows(): + ret = test_gemm( + dtype, + int(row["M"]), + int(row["N"]), + int(row["K"]), + ck_preshuffle=preshuffle, + ) df.append(ret) +else: + for dtype in args.dtype: + for m in args.m: + for n, k in args.nk: + for ck_p in l_preshuffle: + ret = test_gemm(dtype, m, n, k, ck_preshuffle=ck_p) + df.append(ret) + df = pd.DataFrame(df) # Configure pandas to show all columns without truncation @@ -256,3 +300,13 @@ def run_asm(x, weight, x_scale, w_scale, dtype=dtypes.bf16): df_md = df.to_markdown(index=False) aiter.logger.info("gemm_a8w8_blockscale summary (markdown):\n%s", df_md) + +if args.output: + os.makedirs(args.output, exist_ok=True) + if args.csv: + csv_filename = os.path.basename(args.csv).replace(".csv", f"_{args.suffix}.csv") + else: + csv_filename = f"gemm_a8w8_blockscale_{args.suffix}.csv" + out_path = os.path.join(args.output, csv_filename) + df.to_csv(out_path, index=False) + print(f"Saved results to: {out_path}") diff --git a/op_tests/test_gemm_codegen.py b/op_tests/test_gemm_codegen.py new file mode 100644 index 0000000000..51a1d8fa02 --- /dev/null +++ b/op_tests/test_gemm_codegen.py @@ -0,0 +1,471 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. + +""" +test_gemm_codegen.py — unit tests for gfx-aware GEMM build targeting and dispatch. + +Covers: + - get_build_targets() build-time target selection (chip_info.py) + - gen_instances filter: CSV row selection per (gfx, cu_num) target + - write_lookup_header: C++ key format in generated lookup headers + - Runtime dispatch key selection in gemm_op_a8w8.py et al. + +No GPU kernel execution or .so compilation required. All tests run on CPU +using only pandas and the chip_info / gemm_op_a8w8 Python layers. + +Scenarios: + 1. get_build_targets() — env-driven target selection + 2. gen_instances filter — CSV row selection per target GPU + 3. write_lookup_header — C++ key format in generated lookup header + 4. Runtime dispatch key selection — (gfx, cu_num, M, N, K) lookup + +Usage: + python op_tests/test_gemm_codegen.py + GPU_ARCHS=gfx942 python op_tests/test_gemm_codegen.py +""" + +import os +import sys +import tempfile +import textwrap + +# Ensure the repo-local aiter is imported, not any system/site-packages install. +_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, _REPO_ROOT) +# Import arch constants directly from build_targets — no torch dependency. +sys.path.insert(0, os.path.join(_REPO_ROOT, "aiter", "jit", "utils")) +from build_targets import ( # noqa: E402 + GFX_CU_NUM_MAP, + filter_tune_df, + get_build_targets_env, +) + +import pandas as pd # noqa: E402 + +REPRO_CSV = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "configs", + "gemm_codegen_gfx_filter.csv", +) +REPRO_BPRESHUFFLE_CSV = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "configs", + "gemm_codegen_gfx_filter_bpreshuffle.csv", +) + +# GPU targets used throughout this test. cu_num values match GFX_CU_NUM_MAP +# in aiter/jit/utils/build_targets.py (re-exported via chip_info.py) — update +# here if that mapping changes. +TARGET_A = ("gfx942", 304) # MI300X +TARGET_B = ("gfx950", 256) # MI350 +TARGET_C = ("gfx942", 80) # MI308X — gfx942 with CU_NUM override + +# --------------------------------------------------------------------------- +# Minimal test harness (no external test framework required) +# --------------------------------------------------------------------------- + +_passed = _failed = 0 + + +def _check(name: str, condition: bool, detail: str = "") -> None: + global _passed, _failed + if condition: + _passed += 1 + print(f" PASS {name}") + else: + _failed += 1 + msg = f" FAIL {name}" + if detail: + msg += f"\n {detail}" + print(msg) + + +def _section(title: str) -> None: + print(f"\n{'='*60}") + print(f" {title}") + print("=" * 60) + + +# --------------------------------------------------------------------------- +# Section 1: get_build_targets() +# --------------------------------------------------------------------------- + + +def test_get_build_targets(): + _section("1. get_build_targets() — env-driven target selection") + + orig_archs = os.environ.pop("GPU_ARCHS", None) + orig_cu = os.environ.pop("CU_NUM", None) + + try: + # 1.1 Single known arch + os.environ["GPU_ARCHS"] = TARGET_A[0] + t = get_build_targets_env() + _check(f"GPU_ARCHS={TARGET_A[0]} → [{TARGET_A}]", t == [TARGET_A], str(t)) + + # 1.2 CU_NUM override (MI308X: gfx942 but cu_num=80) + os.environ["GPU_ARCHS"] = TARGET_C[0] + os.environ["CU_NUM"] = str(TARGET_C[1]) + t = get_build_targets_env() + _check( + f"GPU_ARCHS={TARGET_C[0]} + CU_NUM={TARGET_C[1]} → [{TARGET_C}]", + t == [TARGET_C], + str(t), + ) + del os.environ["CU_NUM"] + + # 1.3 Second known arch + os.environ["GPU_ARCHS"] = TARGET_B[0] + t = get_build_targets_env() + _check(f"GPU_ARCHS={TARGET_B[0]} → [{TARGET_B}]", t == [TARGET_B], str(t)) + + # 1.4 Multi-arch (semicolon-separated) + os.environ["GPU_ARCHS"] = f"{TARGET_A[0]};{TARGET_B[0]}" + t = get_build_targets_env() + _check( + f"GPU_ARCHS={TARGET_A[0]};{TARGET_B[0]} → two targets", + t == [TARGET_A, TARGET_B], + str(t), + ) + + # 1.5 Unknown arch raises RuntimeError + os.environ["GPU_ARCHS"] = "gfx999" + raised = False + try: + get_build_targets_env() + except RuntimeError: + raised = True + _check("GPU_ARCHS=gfx999 → RuntimeError", raised) + + # 1.6 Separator-only GPU_ARCHS raises RuntimeError + os.environ["GPU_ARCHS"] = " ; " + raised = False + try: + get_build_targets_env() + except RuntimeError: + raised = True + _check("GPU_ARCHS=' ; ' → RuntimeError", raised) + + # 1.7 GFX_CU_NUM_MAP covers at least the two known production targets + _check( + "GFX_CU_NUM_MAP contains gfx942 and gfx950", + "gfx942" in GFX_CU_NUM_MAP and "gfx950" in GFX_CU_NUM_MAP, + ) + + # 1.8 Live GPU fallback — requires torch and a GPU; skipped otherwise + del os.environ["GPU_ARCHS"] + try: + from aiter.jit.utils.chip_info import get_build_targets + + t = get_build_targets() + _check( + "No GPU_ARCHS + live GPU → single (gfx, cu_num) pair", + len(t) == 1 and isinstance(t[0], tuple) and len(t[0]) == 2, + str(t), + ) + except (ImportError, ModuleNotFoundError): + print(" SKIP No GPU_ARCHS + live GPU (torch not available)") + except RuntimeError: + print(" SKIP No GPU_ARCHS + live GPU (no GPU detected — expected in CI)") + + finally: + if orig_archs is not None: + os.environ["GPU_ARCHS"] = orig_archs + elif "GPU_ARCHS" in os.environ: + del os.environ["GPU_ARCHS"] + if orig_cu is not None: + os.environ["CU_NUM"] = orig_cu + elif "CU_NUM" in os.environ: + del os.environ["CU_NUM"] + + +# --------------------------------------------------------------------------- +# Section 2: gen_instances filter — uses filter_tune_df from build_targets +# --------------------------------------------------------------------------- + + +def test_gen_instances_filter( + csv_path=None, target_a=TARGET_A, target_b=TARGET_B, label="" +): + """Verify gen_instances filter behaviour against a repro CSV.""" + if csv_path is None: + csv_path = REPRO_CSV + pfx = f"[{label}] " if label else "" + + _section( + f"2. gen_instances filter — CSV row selection per target{' (' + label + ')' if label else ''}" + ) + + if not os.path.exists(csv_path): + print(f" SKIP repro CSV not found: {csv_path}") + return + + df = pd.read_csv(csv_path) + gfx_a, cu_a = target_a + gfx_b, cu_b = target_b + + # 2.1 gfx column present (fix applied to CSV) + _check(f"{pfx}repro CSV has 'gfx' column", "gfx" in df.columns) + + # 2.2 Bug scenario: no filter compiles all rows (last-writer-wins) + _check( + f"{pfx}unfiltered CSV has rows for multiple gfx targets (bug: all compiled)", + df["gfx"].nunique() > 1, + f"gfx targets found: {df['gfx'].unique().tolist()}", + ) + + # 2.3 Fix: filter for target_a selects only those rows + filtered = filter_tune_df(df, [target_a]) + _check( + f"{pfx}{gfx_a}/cu_num={cu_a} filter keeps only {gfx_a} rows", + len(filtered) > 0 + and all(filtered["gfx"] == gfx_a) + and all(filtered["cu_num"] == cu_a), + f"rows={len(filtered)}, gfx={filtered['gfx'].unique().tolist()}", + ) + + # 2.4 Fix: filter for target_b selects only those rows + filtered = filter_tune_df(df, [target_b]) + _check( + f"{pfx}{gfx_b}/cu_num={cu_b} filter keeps only {gfx_b} rows", + len(filtered) > 0 + and all(filtered["gfx"] == gfx_b) + and all(filtered["cu_num"] == cu_b), + f"rows={len(filtered)}", + ) + + # 2.5 Multi-arch filter is the union of per-arch filters + n_a = len(filter_tune_df(df, [target_a])) + n_b = len(filter_tune_df(df, [target_b])) + n_multi = len(filter_tune_df(df, [target_a, target_b])) + _check( + f"{pfx}multi-arch filter row count equals sum of individual filters", + n_multi == n_a + n_b, + f"multi={n_multi}, {gfx_a}/{cu_a}={n_a}, {gfx_b}/{cu_b}={n_b}", + ) + + # 2.6 All MNK shapes in the repro CSV have different kernelIds across gfx targets + grp = df.groupby(["M", "N", "K"])["kernelId"].nunique() + shapes_with_diff = grp[grp > 1] + _check( + f"{pfx}repro CSV has shapes with different kernelIds across gfx targets", + len(shapes_with_diff) > 0, + f"shapes with diverging kernelIds: {len(shapes_with_diff)}/{len(grp)}", + ) + + # 2.7 Contamination: the two targets share MNK shapes with different kernelIds + d_a = filter_tune_df(df, [target_a]).set_index(["M", "N", "K"]) + d_b = filter_tune_df(df, [target_b]).set_index(["M", "N", "K"]) + common = d_a.index.intersection(d_b.index) + if len(common) > 0: + n_diff = sum( + d_a.loc[idx, "kernelId"] != d_b.loc[idx, "kernelId"] for idx in common + ) + _check( + f"{pfx}shared MNK shapes have different kernelIds across {gfx_a}/{cu_a} and {gfx_b}/{cu_b}", + n_diff > 0, + f"{n_diff}/{len(common)} shared shapes have diverging kernelIds", + ) + else: + print( + f" SKIP no MNK overlap between {gfx_a}/{cu_a} and {gfx_b}/{cu_b} in repro CSV" + ) + + +# --------------------------------------------------------------------------- +# Section 3: Python runtime dispatch key selection +# Tests get_CKGEMM_config() using unique temp CSV files to avoid polluting +# the module-level cache used by the real config files. +# --------------------------------------------------------------------------- + + +def _make_temp_csv(content: str) -> str: + f = tempfile.NamedTemporaryFile( + mode="w", suffix=".csv", delete=False, prefix="test_gemm_codegen_" + ) + f.write(textwrap.dedent(content).strip() + "\n") + f.close() + return f.name + + +def test_runtime_dispatch_key(): + _section("4. Runtime dispatch — (gfx, cu_num, M, N, K) lookup key") + + try: + from aiter.ops.gemm_op_a8w8 import get_CKGEMM_config + import aiter.ops.gemm_op_a8w8 as _mod + except Exception as e: + print(f" SKIP could not import get_CKGEMM_config ({e})") + return + + # get_CKGEMM_config() uses get_gfx_runtime() which always detects the live GPU + # via rocminfo — GPU_ARCHS is intentionally ignored at runtime. Derive the + # test CSV rows from the actual live GPU so the test is correct on any runner. + try: + from aiter.jit.utils.chip_info import get_gfx_runtime, get_cu_num + + gfx = get_gfx_runtime() + cu_num = get_cu_num() + except Exception as e: + print(f" SKIP runtime dispatch tests require a live GPU ({e})") + return + + # Pick a "wrong" target that is guaranteed to differ from the live GPU. + wrong_target = TARGET_B if gfx != TARGET_B[0] else TARGET_A + wrong_gfx, wrong_cu_num = wrong_target + + csv_with_gfx = wrong_gfx_csv = old_csv = None + try: + # 3.1 New CSV schema (gfx column present) — correct target is found + csv_with_gfx = _make_temp_csv(f""" + gfx,cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio + {gfx},{cu_num},128,1280,8192,42,0,10.0,correct_kernel,100.0,500.0,0.0 + {wrong_gfx},{wrong_cu_num},128,1280,8192,99,0,10.0,wrong_kernel,100.0,500.0,0.0 + """) + _mod._CKGEMM_CONFIG_CACHE = {} + cfg = get_CKGEMM_config(128, 1280, 8192, tuned_file=csv_with_gfx) + _check( + "new CSV (gfx column): shape tuned for this gfx is found", + cfg is not None, + "returned None", + ) + if cfg is not None: + _check( + "new CSV: kernelId matches this gfx target, not the other", + cfg.get("kernelId") == 42, + f"expected kernelId=42, got {cfg.get('kernelId')}", + ) + + # 3.2 Shape tuned only for a different gfx returns None on this target + wrong_gfx_csv = _make_temp_csv(f""" + gfx,cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio + {wrong_gfx},{wrong_cu_num},128,1280,8192,99,0,10.0,wrong_kernel,100.0,500.0,0.0 + """) + _mod._CKGEMM_CONFIG_CACHE = {} + cfg = get_CKGEMM_config(128, 1280, 8192, tuned_file=wrong_gfx_csv) + _check( + f"new CSV: shape tuned only for {wrong_gfx} returns None on {gfx}", + cfg is None, + f"expected None, got {cfg}", + ) + + # 3.3 Old CSV (no gfx column) falls back to cu_num-only key with a warning + old_csv = _make_temp_csv(f""" + cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio + {cu_num},128,1280,8192,7,0,10.0,old_kernel,100.0,500.0,0.0 + """) + import logging + import io + + buf = io.StringIO() + handler = logging.StreamHandler(buf) + logging.getLogger("aiter").addHandler(handler) + _mod._CKGEMM_CONFIG_CACHE = {} + cfg = get_CKGEMM_config(128, 1280, 8192, tuned_file=old_csv) + logging.getLogger("aiter").removeHandler(handler) + + _check( + "old CSV (no gfx column): shape still found via cu_num fallback", + cfg is not None and cfg.get("kernelId") == 7, + f"cfg={cfg}", + ) + _check( + "old CSV (no gfx column): deprecation warning is logged", + "gfx" in buf.getvalue().lower(), + f"log output: {buf.getvalue()!r}", + ) + + finally: + get_CKGEMM_config.cache_clear() + _mod._CKGEMM_CONFIG_CACHE = {} + _mod._CKGEMM_HAS_GFX = {} + for path in [csv_with_gfx, wrong_gfx_csv, old_csv]: + if path: + try: + os.unlink(path) + except Exception: + pass + + +def test_write_lookup_header(): + _section("3. write_lookup_header — C++ key format") + + from chip_info import write_lookup_header + + class _FakeKernel: + def __init__(self, name): + self.name = name + + kernels_dict = { + ("gfx942", 304, 128, 4096, 4096): _FakeKernel("kernel_non_batched"), + ("gfx942", 304, 2, 128, 4096, 4096): _FakeKernel("kernel_batched"), + -1: _FakeKernel("default_kernel"), # default_dict entry — must be skipped + } + + LOOKUP_head = "#ifdef USE_ROCM\n#define GENERATE_LOOKUP_TABLE(DTYPE, ETYPE) {\\\n" + LOOKUP_template = " {{{MNK}, {kernel_name}}},\\\n" + LOOKUP_end = "}\n#endif\n" + + path = None + try: + f = tempfile.NamedTemporaryFile(mode="w", suffix=".h", delete=False) + path = f.name + f.close() + write_lookup_header( + path, kernels_dict, LOOKUP_head, LOOKUP_template, LOOKUP_end + ) + content = open(path).read() + + _check( + "non-batched key: gfx string quoted in C++ initializer", + '{"gfx942", 304, 128, 4096, 4096}' in content, + f"not found in output:\n{content}", + ) + _check( + "batched key: 6-tuple with gfx string quoted", + '{"gfx942", 304, 2, 128, 4096, 4096}' in content, + f"not found in output:\n{content}", + ) + _check( + "default_dict (-1) entry is skipped", + "default_kernel" not in content, + f"default_kernel unexpectedly in output:\n{content}", + ) + _check( + "old-style key without gfx (regression guard): {304, 128, ...} absent", + "{304, 128, 4096, 4096}" not in content, + f"old-style key found in output:\n{content}", + ) + finally: + if path: + try: + os.unlink(path) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + test_get_build_targets() + test_gen_instances_filter( + csv_path=REPRO_CSV, + target_a=TARGET_C, + target_b=TARGET_B, + label="module_gemm_a8w8", + ) + test_gen_instances_filter( + csv_path=REPRO_BPRESHUFFLE_CSV, + target_a=TARGET_A, + target_b=TARGET_B, + label="module_gemm_a8w8_bpreshuffle", + ) + test_write_lookup_header() + test_runtime_dispatch_key() + + print(f"\n{'='*60}") + print(f" Results: {_passed} passed, {_failed} failed") + print("=" * 60) + sys.exit(0 if _failed == 0 else 1) diff --git a/op_tests/test_pretune.py b/op_tests/test_pretune.py new file mode 100644 index 0000000000..2fb6eb9f22 --- /dev/null +++ b/op_tests/test_pretune.py @@ -0,0 +1,371 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +""" +Unit tests for aiter/utility/pretune.py. + +No GPU or torch required — tests exercise file-system and config resolution +logic only. Run with: + python op_tests/test_pretune.py +""" + +import json +import logging +import os +import sys +import tempfile + +import pandas as pd + +REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +CSRC_DIR = os.path.join(REPO_DIR, "csrc") +CFG_PATH = os.path.join(REPO_DIR, "aiter", "jit", "optCompilerConfig.json") + +sys.path.insert(0, os.path.join(REPO_DIR, "aiter")) + +from utility.pretune import ( # noqa: E402 + _all_tune_modules, + _make_untune_csv, + _parse_module_list, + _resolve, + _SCRIPT_FALLBACK, +) + +with open(CFG_PATH) as f: + CFG = json.load(f) + +logging.getLogger("aiter").addHandler(logging.StreamHandler(sys.stdout)) + +# ── expected resolution table ──────────────────────────────────────────────── +# (tune_module, expected_script_suffix_from_repo_root, expected_config_attr, expect_skip) +EXPECTED = [ + ( + "module_batched_gemm_a8w8_tune", + "csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py", + "AITER_CONFIG_A8W8_BATCHED_GEMM_FILE", + False, + ), + ( + "module_batched_gemm_bf16_tune", + "csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py", + "AITER_CONFIG_BF16_BATCHED_GEMM_FILE", + False, + ), + ( + "module_gemm_a4w4_blockscale_tune", + "csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py", + "AITER_CONFIG_GEMM_A4W4_FILE", + False, + ), + ( + "module_gemm_a8w8_tune", + "csrc/ck_gemm_a8w8/gemm_a8w8_tune.py", + "AITER_CONFIG_GEMM_A8W8_FILE", + False, + ), + ( + "module_gemm_a8w8_blockscale_tune", + "csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py", + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE", + False, + ), + ( + # cktile variant: falls back to blockscale parent script + "module_gemm_a8w8_blockscale_cktile_tune", + "csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py", + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE", + False, + ), + ( + "module_gemm_a8w8_bpreshuffle_tune", + "csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py", + "AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE", + False, + ), + ( + # cktile variant: falls back to bpreshuffle parent script + "module_gemm_a8w8_bpreshuffle_cktile_tune", + "csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py", + "AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE", + False, + ), + ( + # no tune script writes to the bpreshuffle blockscale CSV + "module_gemm_a8w8_blockscale_bpreshuffle_tune", + None, + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE", + True, + ), + ( + "module_gemm_a8w8_blockscale_bpreshuffle_cktile_tune", + None, + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE", + True, + ), +] + +# AITER_CONFIGS tune CSVs with expected shape key columns after _make_untune_csv. +# FMOE and BF16 GEMM are excluded — they have no tune module. +TUNE_CONFIGS = [ + ("a8w8_blockscale_tuned_gemm.csv", ["M", "N", "K"]), + ("a8w8_tuned_gemm.csv", ["M", "N", "K"]), + ("a4w4_blockscale_tuned_gemm.csv", ["M", "N", "K"]), + ("a8w8_bpreshuffle_tuned_gemm.csv", ["M", "N", "K"]), + ("a8w8_blockscale_bpreshuffle_tuned_gemm.csv", ["M", "N", "K"]), + ("a8w8_tuned_batched_gemm.csv", ["B", "M", "N", "K"]), + ("bf16_tuned_batched_gemm.csv", ["B", "M", "N", "K"]), +] + +CONFIGS_DIR = os.path.join(REPO_DIR, "aiter", "configs") + + +# ── test helpers ────────────────────────────────────────────────────────────── + +_passed = 0 +_failed = 0 + + +def check(name: str, condition: bool, msg: str = ""): + global _passed, _failed + if condition: + print(f" [PASS] {name}") + _passed += 1 + else: + print(f" [FAIL] {name}{': ' + msg if msg else ''}") + _failed += 1 + + +# ── tests ───────────────────────────────────────────────────────────────────── + + +def test_resolve(): + """_resolve returns correct script path and config attr for every tune module.""" + print("\n=== test_resolve ===") + for module, script_suffix, config_attr, expect_skip in EXPECTED: + script, attr = _resolve(module, CFG, CSRC_DIR) + + if expect_skip: + check( + f"{module} script=None", script is None, f"expected None, got {script}" + ) + else: + expected_abs = os.path.join(REPO_DIR, script_suffix) + check( + f"{module} script path", + script == expected_abs, + f"\n got: {script}\n expected: {expected_abs}", + ) + check( + f"{module} script exists", + script is not None and os.path.exists(script), + f"not on disk: {script}", + ) + + check( + f"{module} config_attr", + attr == config_attr, + f"got {attr!r}, expected {config_attr!r}", + ) + + +def test_all_tune_modules_covered(): + """Every _tune module in optCompilerConfig.json appears in EXPECTED.""" + print("\n=== test_all_tune_modules_covered ===") + cfg_modules = set(_all_tune_modules(CFG)) + expected_modules = {e[0] for e in EXPECTED} + missing_from_expected = cfg_modules - expected_modules + extra_in_expected = expected_modules - cfg_modules + check( + "no modules missing from EXPECTED", + not missing_from_expected, + f"{missing_from_expected}", + ) + check("no extra modules in EXPECTED", not extra_in_expected, f"{extra_in_expected}") + + +def test_script_fallback_keys_in_config(): + """Every key in _SCRIPT_FALLBACK is a known tune module in optCompilerConfig.json.""" + print("\n=== test_script_fallback_keys_in_config ===") + cfg_modules = set(_all_tune_modules(CFG)) + unknown = set(_SCRIPT_FALLBACK) - cfg_modules + check( + "all _SCRIPT_FALLBACK keys are valid tune modules", + not unknown, + f"unknown: {unknown}", + ) + + +def test_make_untune_csv(): + """_make_untune_csv retains shape keys, drops metadata columns, deduplicates.""" + print("\n=== test_make_untune_csv ===") + for csv_name, expected_shape_keys in TUNE_CONFIGS: + csv_path = os.path.join(CONFIGS_DIR, csv_name) + if not os.path.exists(csv_path): + print(f" [SKIP] {csv_name}: file not present") + continue + + orig = pd.read_csv(csv_path) + present_keys = [k for k in expected_shape_keys if k in orig.columns] + expected_unique = orig[present_keys].drop_duplicates().shape[0] + + tmp = _make_untune_csv(csv_path, ["B", "M", "N", "K"]) + try: + df = pd.read_csv(tmp) + finally: + os.unlink(tmp) + + check( + f"{csv_name} columns={expected_shape_keys}", + list(df.columns) == expected_shape_keys, + f"got {df.columns.tolist()}", + ) + check( + f"{csv_name} no metadata columns leaked", + not any( + c in df.columns for c in ["gfx", "cu_num", "kernelId", "us", "libtype"] + ), + ) + check(f"{csv_name} non-empty", len(df) > 0) + check( + f"{csv_name} deduplicated ({expected_unique} unique shapes)", + len(df) == expected_unique, + f"got {len(df)}", + ) + + +def test_make_untune_csv_multi_path(): + """_make_untune_csv handles colon-separated multi-path tune_file correctly.""" + print("\n=== test_make_untune_csv_multi_path ===") + rows_a = pd.DataFrame({"M": [128, 256], "N": [512, 512], "K": [1024, 1024]}) + rows_b = pd.DataFrame( + {"M": [256, 512], "N": [512, 512], "K": [1024, 1024]} + ) # 256 overlaps + + with ( + tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as fa, + tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as fb, + ): + rows_a.to_csv(fa.name, index=False) + rows_b.to_csv(fb.name, index=False) + tune_file = f"{fa.name}{os.pathsep}{fb.name}" + + try: + tmp = _make_untune_csv(tune_file, ["B", "M", "N", "K"]) + df = pd.read_csv(tmp) + os.unlink(tmp) + # 3 unique (M,N,K): (128,512,1024), (256,512,1024), (512,512,1024) + check("multi-path: 3 unique shapes after dedup", len(df) == 3, f"got {len(df)}") + check("multi-path: columns=[M,N,K]", list(df.columns) == ["M", "N", "K"]) + finally: + os.unlink(fa.name) + os.unlink(fb.name) + + +def test_make_untune_csv_missing_raises(): + """_make_untune_csv raises FileNotFoundError when no CSV path exists.""" + print("\n=== test_make_untune_csv_missing_raises ===") + raised = False + try: + _make_untune_csv("/nonexistent/path.csv", ["M", "N", "K"]) + except FileNotFoundError: + raised = True + check("raises FileNotFoundError for missing path", raised) + + +def test_write_tune_file_resolution(): + """In standalone mode, write_tune_file must resolve to the source CSV (in aiter/configs/), + not the ephemeral /tmp merged path. + + The logic in run_pretune() strips the _FILE suffix from config_attr to derive a + module-level variable name in core (e.g. AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE → + AITER_CONFIG_GEMM_A8W8_BLOCKSCALE), then calls getattr(core, source_attr). + This test verifies that all supported modules have a matching module-level variable + in core.py whose value points inside aiter/configs/ (not /tmp/). + """ + print("\n=== test_write_tune_file_resolution ===") + core_py = os.path.join(REPO_DIR, "aiter", "jit", "core.py") + with open(core_py, encoding="utf-8") as f: + core_src = f.read() + + for module, _, config_attr, expect_skip in EXPECTED: + if expect_skip or config_attr is None: + continue + source_attr = config_attr.removesuffix("_FILE") + if source_attr == config_attr: + check( + f"{module} config_attr has _FILE suffix", + False, + f"config_attr {config_attr!r} has no _FILE suffix — cannot derive source_attr", + ) + continue + # Verify the module-level variable exists in core.py + check( + f"{module} source_attr '{source_attr}' defined in core.py", + f"{source_attr} " in core_src or f"{source_attr}=" in core_src, + f"not found in core.py — getattr(core, {source_attr!r}) would return None", + ) + # Verify it maps to aiter/configs/ (not /tmp/) + import re + + m = re.search( + rf'{source_attr}\s*=\s*os\.getenv\(\s*["\'][\w]+["\'],\s*([^\)]+)\)', + core_src, + ) + if m: + default_val = m.group(1).strip().strip('"').strip("'") + check( + f"{module} default write_tune_file is in aiter/configs/", + "aiter/configs" in default_val or "aiter_meta/configs" in default_val, + f"default points to: {default_val!r}", + ) + + +def test_parse_pretune_modules(): + """PRETUNE_MODULES env values parse to the correct module lists.""" + print("\n=== test_parse_pretune_modules ===") + cases = [ + ( + "module_gemm_a8w8_blockscale_tune", + ["module_gemm_a8w8_blockscale_tune"], + ), + ( + "module_gemm_a8w8_tune,module_gemm_a8w8_blockscale_tune", + ["module_gemm_a8w8_tune", "module_gemm_a8w8_blockscale_tune"], + ), + ( + " module_gemm_a8w8_tune , module_gemm_a8w8_blockscale_tune ", + ["module_gemm_a8w8_tune", "module_gemm_a8w8_blockscale_tune"], + ), + ] + for env_value, expected in cases: + modules = _parse_module_list(env_value, CFG) + check(f"parse {env_value.strip()!r}", modules == expected, f"got {modules}") + + # "all" expands to every _tune module, excluding _unsupported entries + all_modules = _all_tune_modules(CFG) + check("all: count matches EXPECTED", len(all_modules) == len(EXPECTED)) + check("all: set matches EXPECTED", set(all_modules) == {e[0] for e in EXPECTED}) + _unsupported = {m for m, v in _SCRIPT_FALLBACK.items() if v is None} + all_parsed = _parse_module_list("all", CFG) + check( + "all: _unsupported modules excluded", + not any(m in _unsupported for m in all_parsed), + f"unsupported in result: {[m for m in all_parsed if m in _unsupported]}", + ) + + +# ── main ────────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + test_resolve() + test_all_tune_modules_covered() + test_script_fallback_keys_in_config() + test_make_untune_csv() + test_make_untune_csv_multi_path() + test_make_untune_csv_missing_raises() + test_write_tune_file_resolution() + test_parse_pretune_modules() + + print(f"\n{'='*50}") + print(f"Results: {_passed} passed, {_failed} failed") + if _failed: + sys.exit(1) diff --git a/setup.py b/setup.py index d3ad33361f..26fb240381 100644 --- a/setup.py +++ b/setup.py @@ -10,12 +10,14 @@ from setuptools.command.build_ext import build_ext this_dir = os.path.dirname(os.path.abspath(__file__)) +OPT_COMPILER_CONFIG = os.path.join(this_dir, "aiter", "jit", "optCompilerConfig.json") PACKAGE_NAME = "amd-aiter" FLYDSL_VERSION = "flydsl==0.1.3.1" BUILD_TARGET = os.environ.get("BUILD_TARGET", "auto") PREBUILD_KERNELS = int(os.environ.get("PREBUILD_KERNELS", 0)) +PRETUNE_MODULES = os.environ.get("PRETUNE_MODULES", "") ENABLE_CK = int(os.environ.get("ENABLE_CK", "1")) IS_WINDOWS = sys.platform == "win32" if IS_WINDOWS: @@ -156,7 +158,7 @@ def _is_metadata_only(): def _load_modules_from_config(): - cfg_path = os.path.join(this_dir, "aiter", "jit", "optCompilerConfig.json") + cfg_path = OPT_COMPILER_CONFIG try: with open(cfg_path, "r", encoding="utf-8") as f: data = json.load(f) @@ -178,6 +180,7 @@ def get_exclude_ops(): for module in all_modules: if PREBUILD_KERNELS == 1: + # Exclude tune modules; for MHA keep only fmha_v3 fwd variants if "_tune" in module: exclude_ops.append(module) if "mha" in module and module not in [ @@ -315,6 +318,28 @@ def build_one_module(one_opt_args): with ThreadPoolExecutor(max_workers=prebuid_thread_num) as executor: list(executor.map(build_one_module, all_opts_args_build)) + # Retune GEMM shapes on the live GPU after the main build phase. + # Each requested module's tune script benchmarks all CSV shapes and + # writes results tagged with the live GPU's (gfx, cu_num) back to + # the source CSV, then rebuilds the inference .so. + if PRETUNE_MODULES: + # Import directly from the file to avoid triggering aiter/__init__.py, + # which would try to load module_aiter_core before it is registered. + sys.path.insert(0, os.path.join(this_dir, "aiter", "utility")) + from pretune import run_pretune_modules # noqa: E402 + + cfg_path = OPT_COMPILER_CONFIG + with open(cfg_path, "r", encoding="utf-8") as _f: + _cfg = json.load(_f) + run_pretune_modules( + PRETUNE_MODULES, + _cfg, + core, + build_one_module, + csrc_dir=f"{this_dir}/csrc", + repo_dir=this_dir, + ) + # --- FlyDSL AOT pre-compilation --- try: flydsl_cache_dir = os.path.join(this_dir, "aiter", "jit", "flydsl_cache")