diff --git a/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv b/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv new file mode 100644 index 0000000000..5496aa90d8 --- /dev/null +++ b/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv @@ -0,0 +1,318 @@ +cu_num,M,N,K,q_dtype_w,kernelId,splitK,us,kernelName,tflops,bw,errRatio +80,1,512,7168,torch.float8_e4m3fnuz,5,0,12.4802,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,5.88,294.72,0 +80,1,1280,8192,torch.float8_e4m3fnuz,11,0,14.3155,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,14.65,733.23,0 +80,1,2112,7168,torch.float8_e4m3fnuz,11,0,13.4147,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,22.57,1129.37,0 +80,1,2240,7168,torch.float8_e4m3fnuz,10,0,11.8963,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,26.99,1350.67,0 +80,1,4096,512,torch.float8_e4m3fnuz,9,0,4.2138,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.95,499.75,0 +80,1,4608,4096,torch.float8_e4m3fnuz,11,0,10.5838,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,35.67,1784.58,0 +80,1,4608,7168,torch.float8_e4m3fnuz,5,0,13.8515,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,47.69,2385.77,0 +80,1,7168,256,torch.float8_e4m3fnuz,75,0,6.4606,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,5.68,286.29,0 +80,1,7168,2304,torch.float8_e4m3fnuz,29,0,10.1762,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,32.46,1624.55,0 +80,1,8192,1024,torch.float8_e4m3fnuz,15,0,6.4482,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,26.02,1303.62,0 +80,1,9216,4096,torch.float8_e4m3fnuz,19,0,14.4331,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,52.31,2616.99,0 +80,1,11264,1536,torch.float8_e4m3fnuz,108,0,9.3388,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,37.05,1855.22,0 +80,2,4608,4096,torch.float8_e4m3fnuz,5,0,10.661,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,70.82,1772.91,0 +80,2,9216,4096,torch.float8_e4m3fnuz,5,0,14.4331,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,104.62,2618.55,0 +80,4,4608,4096,torch.float8_e4m3fnuz,5,0,10.4863,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,143.99,1804.99,0 +80,4,9216,4096,torch.float8_e4m3fnuz,11,0,14.6151,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,206.63,2589.02,0 +80,8,4608,4096,torch.float8_e4m3fnuz,5,0,10.8626,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,278.01,1747.36,0 +80,8,9216,4096,torch.float8_e4m3fnuz,5,0,15.0571,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,401.13,2519.01,0 +80,16,512,7168,torch.float8_e4m3fnuz,24,0,10.7923,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,108.82,352.20,0 +80,16,576,7168,torch.float8_e4m3fnuz,10,0,11.091,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,119.12,384.27,0 +80,16,1536,7168,torch.float8_e4m3fnuz,10,0,11.2927,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,311.99,989.48,0 +80,16,2112,7168,torch.float8_e4m3fnuz,24,0,11.7534,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,412.17,1303.55,0 +80,16,2240,7168,torch.float8_e4m3fnuz,10,0,11.1279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,461.72,1459.64,0 +80,16,3072,1536,torch.float8_e4m3fnuz,11,0,5.8974,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,256.04,820.95,0 +80,16,4096,512,torch.float8_e4m3fnuz,9,0,4.939,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,135.88,452.81,0 +80,16,4608,4096,torch.float8_e4m3fnuz,24,0,10.8403,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,557.16,1760.78,0 +80,16,4608,7168,torch.float8_e4m3fnuz,5,0,14.1863,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,745.06,2346.79,0 +80,16,7168,256,torch.float8_e4m3fnuz,75,0,6.629,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,88.58,312.03,0 +80,16,7168,2048,torch.float8_e4m3fnuz,10,0,8.769,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,535.71,1703.98,0 +80,16,7168,2304,torch.float8_e4m3fnuz,15,0,10.491,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,503.75,1599.59,0 +80,16,9216,4096,torch.float8_e4m3fnuz,11,0,15.5867,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,774.99,2444.98,0 +80,16,11264,1536,torch.float8_e4m3fnuz,10,0,9.7876,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,565.66,1807.03,0 +80,32,512,7168,torch.float8_e4m3fnuz,10,0,11.1022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,211.56,354.18,0 +80,32,576,7168,torch.float8_e4m3fnuz,10,0,11.199,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,235.95,392.45,0 +80,32,1280,8192,torch.float8_e4m3fnuz,24,0,12.4575,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,538.70,869.34,0 +80,32,1536,7168,torch.float8_e4m3fnuz,19,0,12.6967,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,554.98,892.97,0 +80,32,2112,7168,torch.float8_e4m3fnuz,5,0,13.1607,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,736.20,1178.00,0 +80,32,2240,7168,torch.float8_e4m3fnuz,11,0,12.8275,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,801.09,1280.77,0 +80,32,3072,1536,torch.float8_e4m3fnuz,112,0,6.9922,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,431.90,709.98,0 +80,32,4096,512,torch.float8_e4m3fnuz,9,0,4.8822,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,274.91,486.60,0 +80,32,4608,4096,torch.float8_e4m3fnuz,6,0,13.4935,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,895.22,1430.34,0 +80,32,4608,7168,torch.float8_e4m3fnuz,12,0,19.4019,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,1089.55,1729.44,0 +80,32,7168,256,torch.float8_e4m3fnuz,75,0,6.8302,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,171.94,337.03,0 +80,32,7168,2048,torch.float8_e4m3fnuz,119,0,11.0711,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,848.63,1373.34,0 +80,32,7168,2304,torch.float8_e4m3fnuz,119,0,11.6614,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,906.38,1461.88,0 +80,32,7168,16384,torch.float8_e4m3fnuz,119,0,53.2367,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1411.84,2224.47,0 +80,32,8192,1024,torch.float8_e4m3fnuz,119,0,7.4863,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,717.14,1194.94,0 +80,32,9216,4096,torch.float8_e4m3fnuz,119,0,18.4783,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1307.44,2081.88,0 +80,32,11264,1536,torch.float8_e4m3fnuz,76,0,12.5451,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,882.65,1440.53,0 +80,32,24576,1536,torch.float8_e4m3fnuz,112,0,19.3781,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1246.73,2031.71,0 +80,48,512,7168,torch.float8_e4m3fnuz,10,0,11.2279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,313.79,361.89,0 +80,48,2112,7168,torch.float8_e4m3fnuz,10,0,18.4707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,786.83,849.22,0 +80,48,2240,7168,torch.float8_e4m3fnuz,10,0,18.3626,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,839.43,904.85,0 +80,48,4096,512,torch.float8_e4m3fnuz,9,0,5.3534,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,376.07,469.78,0 +80,48,4608,7168,torch.float8_e4m3fnuz,113,0,22.5027,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1409.12,1502.78,0 +80,48,7168,256,torch.float8_e4m3fnuz,75,0,7.0126,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,251.21,361.55,0 +80,48,7168,2304,torch.float8_e4m3fnuz,113,0,13.6727,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1159.57,1266.30,0 +80,48,11264,1536,torch.float8_e4m3fnuz,113,0,15.2534,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1088.90,1210.00,0 +80,64,512,7168,torch.float8_e4m3fnuz,10,0,11.1263,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,422.21,376.97,0 +80,64,576,7168,torch.float8_e4m3fnuz,24,0,11.163,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,473.42,417.56,0 +80,64,1280,8192,torch.float8_e4m3fnuz,19,0,13.7807,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,973.95,810.84,0 +80,64,1536,7168,torch.float8_e4m3fnuz,19,0,16.9363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,832.11,688.78,0 +80,64,2112,7168,torch.float8_e4m3fnuz,112,0,18.9179,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1024.30,838.78,0 +80,64,2240,7168,torch.float8_e4m3fnuz,12,0,18.6485,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,1102.08,900.97,0 +80,64,3072,1536,torch.float8_e4m3fnuz,114,0,9.0146,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,670.00,577.96,0 +80,64,4096,512,torch.float8_e4m3fnuz,77,0,5.8414,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,459.54,454.38,0 +80,64,4608,4096,torch.float8_e4m3fnuz,114,0,16.3715,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1475.69,1204.92,0 +80,64,4608,7168,torch.float8_e4m3fnuz,114,0,24.7999,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1704.79,1374.15,0 +80,64,7168,256,torch.float8_e4m3fnuz,75,0,7.6886,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,305.49,360.13,0 +80,64,7168,2048,torch.float8_e4m3fnuz,112,0,14.2819,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1315.69,1101.30,0 +80,64,7168,2304,torch.float8_e4m3fnuz,112,0,15.5151,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1362.50,1133.09,0 +80,64,7168,16384,torch.float8_e4m3fnuz,121,0,81.4496,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1845.61,1466.02,0 +80,64,8192,1024,torch.float8_e4m3fnuz,114,0,10.9842,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,977.53,865.13,0 +80,64,9216,4096,torch.float8_e4m3fnuz,114,0,25.0703,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1927.32,1563.23,0 +80,64,11264,1536,torch.float8_e4m3fnuz,112,0,17.4734,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1267.41,1078.30,0 +80,64,24576,1536,torch.float8_e4m3fnuz,93,0,27.4074,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1762.97,1495.68,0 +80,80,512,7168,torch.float8_e4m3fnuz,24,0,11.2303,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,522.87,385.15,0 +80,80,2112,7168,torch.float8_e4m3fnuz,113,0,24.3931,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,992.99,657.98,0 +80,80,2240,7168,torch.float8_e4m3fnuz,113,0,23.5984,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1088.64,719.89,0 +80,80,4096,512,torch.float8_e4m3fnuz,76,0,6.8102,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,492.71,410.19,0 +80,80,4608,7168,torch.float8_e4m3fnuz,115,0,28.9995,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1822.38,1184.19,0 +80,80,7168,256,torch.float8_e4m3fnuz,75,0,8.2998,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,353.75,361.74,0 +80,80,7168,2304,torch.float8_e4m3fnuz,113,0,18.9055,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1397.69,943.97,0 +80,80,11264,1536,torch.float8_e4m3fnuz,100,0,21.6601,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1278.04,887.65,0 +80,96,512,7168,torch.float8_e4m3fnuz,10,0,11.6635,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,604.14,382.08,0 +80,96,2112,7168,torch.float8_e4m3fnuz,113,0,22.3255,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1301.94,727.08,0 +80,96,2240,7168,torch.float8_e4m3fnuz,113,0,21.7057,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1420.28,791.25,0 +80,96,3072,1536,torch.float8_e4m3fnuz,112,0,9.6664,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,937.24,564.42,0 +80,96,4096,512,torch.float8_e4m3fnuz,76,0,6.6322,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,607.12,442.20,0 +80,96,4608,7168,torch.float8_e4m3fnuz,120,0,29.4707,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2151.90,1174.15,0 +80,96,7168,256,torch.float8_e4m3fnuz,75,0,8.6194,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,408.75,375.41,0 +80,96,7168,2048,torch.float8_e4m3fnuz,113,0,17.6925,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1593.09,918.63,0 +80,96,7168,2304,torch.float8_e4m3fnuz,84,0,19.1495,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1655.86,945.85,0 +80,96,7168,16384,torch.float8_e4m3fnuz,113,0,98.4525,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2290.30,1222.82,0 +80,96,11264,1536,torch.float8_e4m3fnuz,119,0,21.7177,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1529.58,903.03,0 +80,96,24576,1536,torch.float8_e4m3fnuz,94,0,34.225,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2117.68,1245.14,0 +80,112,512,7168,torch.float8_e4m3fnuz,10,0,11.9903,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,685.62,382.60,0 +80,112,2112,7168,torch.float8_e4m3fnuz,112,0,25.9619,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1306.18,632.26,0 +80,112,2240,7168,torch.float8_e4m3fnuz,112,0,26.1124,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1377.36,664.85,0 +80,112,4096,512,torch.float8_e4m3fnuz,76,0,7.3154,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,642.15,419.94,0 +80,112,4608,7168,torch.float8_e4m3fnuz,117,0,37.2568,a8w8_bpreshuffle_256x112x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1985.88,935.81,0 +80,112,7168,256,torch.float8_e4m3fnuz,75,0,9.207,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,446.44,376.81,0 +80,112,7168,2304,torch.float8_e4m3fnuz,119,0,21.7835,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1698.25,843.70,0 +80,112,11264,1536,torch.float8_e4m3fnuz,85,0,25.6868,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1508.77,778.48,0 +80,128,512,7168,torch.float8_e4m3fnuz,19,0,12.3367,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,761.57,382.48,0 +80,128,576,7168,torch.float8_e4m3fnuz,25,0,12.3755,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,854.08,419.68,0 +80,128,1280,8192,torch.float8_e4m3fnuz,6,0,20.4871,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1310.27,579.00,0 +80,128,1536,7168,torch.float8_e4m3fnuz,112,0,22.8955,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1231.06,538.13,0 +80,128,2112,7168,torch.float8_e4m3fnuz,114,0,24.5755,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1576.99,675.35,0 +80,128,2240,7168,torch.float8_e4m3fnuz,114,0,23.9804,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1714.07,731.73,0 +80,128,3072,1536,torch.float8_e4m3fnuz,112,0,11.1838,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1080.10,509.81,0 +80,128,4096,512,torch.float8_e4m3fnuz,76,0,7.2018,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,745.47,445.90,0 +80,128,4608,4096,torch.float8_e4m3fnuz,121,0,24.6387,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1961.08,835.20,0 +80,128,4608,7168,torch.float8_e4m3fnuz,114,0,38.7224,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2183.68,907.16,0 +80,128,7168,256,torch.float8_e4m3fnuz,73,0,10.1406,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,463.25,365.14,0 +80,128,7168,2048,torch.float8_e4m3fnuz,119,0,20.5111,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1832.23,817.96,0 +80,128,7168,2304,torch.float8_e4m3fnuz,84,0,21.9275,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1928.11,850.30,0 +80,128,7168,16384,torch.float8_e4m3fnuz,114,0,119.9291,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2506.88,1012.04,0 +80,128,8192,1024,torch.float8_e4m3fnuz,121,0,14.9703,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1434.50,709.19,0 +80,128,9216,4096,torch.float8_e4m3fnuz,121,0,39.1328,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2469.46,1038.32,0 +80,128,11264,1536,torch.float8_e4m3fnuz,85,0,26.4316,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1675.72,771.11,0 +80,128,24576,1536,torch.float8_e4m3fnuz,85,0,42.5762,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2269.74,1039.00,0 +80,160,2112,7168,torch.float8_e4m3fnuz,115,0,28.9398,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1673.96,586.10,0 +80,160,3072,1536,torch.float8_e4m3fnuz,112,0,11.4913,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1313.99,517.56,0 +80,160,7168,2048,torch.float8_e4m3fnuz,119,0,25.3574,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1852.56,682.31,0 +80,160,7168,16384,torch.float8_e4m3fnuz,136,0,150.746,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2493.00,811.67,0 +80,160,24576,1536,torch.float8_e4m3fnuz,93,0,55.5159,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2175.88,826.05,0 +80,192,1280,8192,torch.float8_e4m3fnuz,113,0,24.3731,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1652.04,514.92,0 +80,192,2112,7168,torch.float8_e4m3fnuz,126,0,32.1086,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1810.51,539.61,0 +80,192,2240,7168,torch.float8_e4m3fnuz,113,0,32.2862,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1909.68,566.58,0 +80,192,3072,1536,torch.float8_e4m3fnuz,119,0,12.4325,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1457.42,498.14,0 +80,192,7168,2048,torch.float8_e4m3fnuz,120,0,28.1354,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2003.58,633.57,0 +80,192,7168,16384,torch.float8_e4m3fnuz,120,0,162.2601,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2779.31,760.13,0 +80,192,8192,1024,torch.float8_e4m3fnuz,85,0,18.7963,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1713.76,624.11,0 +80,192,11264,1536,torch.float8_e4m3fnuz,85,0,31.3634,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2118.32,698.96,0 +80,192,24576,1536,torch.float8_e4m3fnuz,93,0,55.8139,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2597.12,850.70,0 +80,224,2112,7168,torch.float8_e4m3fnuz,126,0,32.5198,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,2085.56,544.00,0 +80,224,3072,1536,torch.float8_e4m3fnuz,113,0,14.6673,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1441.25,439.00,0 +80,224,7168,2048,torch.float8_e4m3fnuz,85,0,30.5218,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2154.74,601.21,0 +80,224,7168,16384,torch.float8_e4m3fnuz,85,0,185.5598,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2835.39,669.98,0 +80,224,24576,1536,torch.float8_e4m3fnuz,85,0,71.3416,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2370.49,688.28,0 +80,256,512,7168,torch.float8_e4m3fnuz,6,0,17.5859,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1068.50,327.94,0 +80,256,576,7168,torch.float8_e4m3fnuz,112,0,18.3779,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,1150.26,340.56,0 +80,256,1280,8192,torch.float8_e4m3fnuz,114,0,26.7743,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2005.17,494.44,0 +80,256,1536,7168,torch.float8_e4m3fnuz,120,0,31.1851,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1807.64,437.12,0 +80,256,2112,7168,torch.float8_e4m3fnuz,114,0,37.3908,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2072.99,482.88,0 +80,256,2240,7168,torch.float8_e4m3fnuz,114,0,37.1525,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2212.73,512.43,0 +80,256,3072,1536,torch.float8_e4m3fnuz,120,0,16.0983,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1500.73,415.24,0 +80,256,4096,512,torch.float8_e4m3fnuz,76,0,10.9814,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,977.78,393.88,0 +80,256,4608,4096,torch.float8_e4m3fnuz,85,0,37.982,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2544.28,586.65,0 +80,256,4608,7168,torch.float8_e4m3fnuz,121,0,62.2609,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2716.22,597.88,0 +80,256,7168,256,torch.float8_e4m3fnuz,73,0,11.5078,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,816.42,484.07,0 +80,256,7168,2048,torch.float8_e4m3fnuz,85,0,30.5235,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2462.43,618.36,0 +80,256,7168,2304,torch.float8_e4m3fnuz,85,0,32.0436,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2638.82,648.33,0 +80,256,7168,16384,torch.float8_e4m3fnuz,85,0,186.0958,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3231.11,673.34,0 +80,256,8192,1024,torch.float8_e4m3fnuz,70,0,23.9267,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1795.05,536.85,0 +80,256,9216,4096,torch.float8_e4m3fnuz,70,0,64.6741,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2988.42,672.85,0 +80,256,11264,1536,torch.float8_e4m3fnuz,85,0,41.1,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2155.32,570.85,0 +80,256,24576,1536,torch.float8_e4m3fnuz,85,0,72.5312,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2664.70,699.35,0 +80,288,2112,7168,torch.float8_e4m3fnuz,113,0,44.2402,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1971.05,416.36,0 +80,288,3072,1536,torch.float8_e4m3fnuz,120,0,16.2173,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1675.93,427.35,0 +80,288,7168,2048,torch.float8_e4m3fnuz,85,0,38.1402,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2217.01,508.61,0 +80,288,7168,16384,torch.float8_e4m3fnuz,85,0,243.1421,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2782.15,519.40,0 +80,288,24576,1536,torch.float8_e4m3fnuz,94,0,79.4368,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2737.18,658.98,0 +80,320,1280,8192,torch.float8_e4m3fnuz,115,0,32.2959,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2077.94,431.21,0 +80,320,2112,7168,torch.float8_e4m3fnuz,113,0,45.9122,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2110.30,409.13,0 +80,320,3072,1536,torch.float8_e4m3fnuz,119,0,16.7997,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1797.59,427.16,0 +80,320,7168,2048,torch.float8_e4m3fnuz,85,0,38.8094,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2420.87,513.35,0 +80,320,7168,16384,torch.float8_e4m3fnuz,101,0,239.9889,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3131.89,530.32,0 +80,320,8192,1024,torch.float8_e4m3fnuz,85,0,24.7583,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2168.45,563.82,0 +80,320,24576,1536,torch.float8_e4m3fnuz,93,0,84.1633,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2870.51,641.24,0 +80,352,2112,7168,torch.float8_e4m3fnuz,128,0,49.7251,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2143.33,385.09,0 +80,352,3072,1536,torch.float8_e4m3fnuz,85,0,18.9609,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1751.97,391.43,0 +80,352,7168,2048,torch.float8_e4m3fnuz,100,0,45.5887,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,2266.96,448.52,0 +80,352,7168,16384,torch.float8_e4m3fnuz,86,0,289.7112,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2853.81,442.70,0 +80,352,24576,1536,torch.float8_e4m3fnuz,93,0,101.4118,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2620.51,548.17,0 +80,384,2112,7168,torch.float8_e4m3fnuz,128,0,47.5375,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2445.78,410.48,0 +80,384,3072,1536,torch.float8_e4m3fnuz,85,0,18.7893,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1928.69,408.09,0 +80,384,7168,2048,torch.float8_e4m3fnuz,86,0,45.0074,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2504.99,465.96,0 +80,384,7168,16384,torch.float8_e4m3fnuz,86,0,268.8438,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3354.90,480.71,0 +80,384,24576,1536,torch.float8_e4m3fnuz,93,0,99.8625,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2903.09,572.92,0 +80,512,512,7168,torch.float8_e4m3fnuz,119,0,23.6639,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1588.11,332.33,0 +80,512,576,7168,torch.float8_e4m3fnuz,114,0,23.9199,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1767.51,350.70,0 +80,512,1280,8192,torch.float8_e4m3fnuz,114,0,42.3216,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2537.10,377.84,0 +80,512,1536,7168,torch.float8_e4m3fnuz,128,0,47.65,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2366.06,341.09,0 +80,512,2112,7168,torch.float8_e4m3fnuz,129,0,60.4509,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2564.42,346.92,0 +80,512,2240,7168,torch.float8_e4m3fnuz,62,0,66.3638,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2477.51,331.81,0 +80,512,3072,1536,torch.float8_e4m3fnuz,85,0,24.3751,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1982.28,354.90,0 +80,512,4096,512,torch.float8_e4m3fnuz,85,0,15.7231,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1365.81,416.81,0 +80,512,4608,7168,torch.float8_e4m3fnuz,70,0,105.5819,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3203.47,392.29,0 +80,512,7168,256,torch.float8_e4m3fnuz,72,0,16.1951,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,1160.26,574.63,0 +80,512,7168,2048,torch.float8_e4m3fnuz,71,0,53.438,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2813.05,431.69,0 +80,512,7168,2304,torch.float8_e4m3fnuz,72,0,57.4769,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2942.30,435.56,0 +80,512,8192,1024,torch.float8_e4m3fnuz,85,0,37.882,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2267.55,456.72,0 +80,512,11264,1536,torch.float8_e4m3fnuz,85,0,63.9678,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2769.63,463.08,0 +80,1024,512,7168,torch.float8_e4m3fnuz,114,0,36.6616,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2050.15,328.92,0 +80,1024,576,7168,torch.float8_e4m3fnuz,114,0,37.1872,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2273.82,340.13,0 +80,1024,1280,8192,torch.float8_e4m3fnuz,85,0,68.1209,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3152.46,315.55,0 +80,1024,1536,7168,torch.float8_e4m3fnuz,136,0,74.0965,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,3043.14,290.11,0 +80,1024,2112,7168,torch.float8_e4m3fnuz,93,0,115.8099,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2677.17,231.45,0 +80,1024,2240,7168,torch.float8_e4m3fnuz,114,0,120.7469,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2723.33,231.76,0 +80,1024,3072,1536,torch.float8_e4m3fnuz,85,0,38.8336,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2488.48,324.02,0 +80,1024,4096,512,torch.float8_e4m3fnuz,85,0,24.9039,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1724.62,442.10,0 +80,1024,4608,4096,torch.float8_e4m3fnuz,71,0,120.44,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3209.46,269.89,0 +80,1024,4608,7168,torch.float8_e4m3fnuz,93,0,189.1551,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3576.20,263.31,0 +80,1024,7168,256,torch.float8_e4m3fnuz,71,0,27.7779,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1352.91,603.98,0 +80,1024,7168,2048,torch.float8_e4m3fnuz,71,0,98.0567,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3066.06,320.81,0 +80,1024,7168,2304,torch.float8_e4m3fnuz,85,0,107.3839,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3149.71,312.47,0 +80,1024,8192,1024,torch.float8_e4m3fnuz,71,0,67.0889,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2560.76,390.74,0 +80,1024,9216,4096,torch.float8_e4m3fnuz,93,0,219.0276,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3529.67,277.67,0 +80,1024,11264,1536,torch.float8_e4m3fnuz,85,0,116.8538,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3032.29,358.94,0 +80,1536,512,7168,torch.float8_e4m3fnuz,120,0,48.0728,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2345.25,338.09,0 +80,1536,576,7168,torch.float8_e4m3fnuz,128,0,47.2304,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2685.47,358.00,0 +80,1536,1536,7168,torch.float8_e4m3fnuz,0,0,105.4423,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3207.71,253.59,0 +80,1536,2112,7168,torch.float8_e4m3fnuz,68,0,148.9129,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3123.06,219.17,0 +80,1536,2240,7168,torch.float8_e4m3fnuz,48,0,177.0619,a8w8_bpreshuffle_256x192x224x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,2785.75,191.73,0 +80,1536,3072,1536,torch.float8_e4m3fnuz,93,0,51.4668,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2816.48,320.89,0 +80,1536,4096,512,torch.float8_e4m3fnuz,85,0,33.2015,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1940.41,465.84,0 +80,1536,4608,7168,torch.float8_e4m3fnuz,85,0,279.9019,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3625.15,207.92,0 +80,1536,7168,256,torch.float8_e4m3fnuz,71,0,37.3284,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1510.15,649.59,0 +80,1536,7168,2048,torch.float8_e4m3fnuz,85,0,138.0452,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3266.84,288.64,0 +80,1536,7168,2304,torch.float8_e4m3fnuz,85,0,150.2725,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3376.15,279.99,0 +80,1536,11264,1536,torch.float8_e4m3fnuz,85,0,168.9501,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3145.91,321.18,0 +80,2048,512,7168,torch.float8_e4m3fnuz,85,0,58.1113,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2586.83,351.86,0 +80,2048,576,7168,torch.float8_e4m3fnuz,129,0,60.7837,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2782.23,348.25,0 +80,2048,1280,8192,torch.float8_e4m3fnuz,0,0,119.2848,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3600.60,272.51,0 +80,2048,1536,7168,torch.float8_e4m3fnuz,85,0,138.8692,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3247.46,230.30,0 +80,2048,2112,7168,torch.float8_e4m3fnuz,93,0,186.5271,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3324.37,206.24,0 +80,2048,2240,7168,torch.float8_e4m3fnuz,79,0,230.9106,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2848.14,172.84,0 +80,2048,3072,1536,torch.float8_e4m3fnuz,72,0,69.8541,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,2766.82,292.71,0 +80,2048,4096,512,torch.float8_e4m3fnuz,71,0,43.6672,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1967.14,456.25,0 +80,2048,4608,4096,torch.float8_e4m3fnuz,68,0,218.9032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3531.67,210.77,0 +80,2048,4608,7168,torch.float8_e4m3fnuz,93,0,366.0791,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3695.69,181.89,0 +80,2048,7168,256,torch.float8_e4m3fnuz,71,0,48.0096,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1565.56,660.69,0 +80,2048,7168,2048,torch.float8_e4m3fnuz,85,0,186.7051,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3220.56,258.35,0 +80,2048,7168,2304,torch.float8_e4m3fnuz,85,0,199.0359,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3398.67,254.19,0 +80,2048,8192,1024,torch.float8_e4m3fnuz,71,0,120.4715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2852.11,365.57,0 +80,2048,9216,4096,torch.float8_e4m3fnuz,68,0,425.2314,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3636.11,197.27,0 +80,2048,11264,1536,torch.float8_e4m3fnuz,71,0,217.6677,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3255.74,305.90,0 +80,4096,512,7168,torch.float8_e4m3fnuz,85,0,101.7067,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2956.03,366.00,0 +80,4096,576,7168,torch.float8_e4m3fnuz,93,0,113.3711,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2983.38,337.01,0 +80,4096,1280,8192,torch.float8_e4m3fnuz,71,0,234.5689,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3662.01,232.45,0 +80,4096,1536,7168,torch.float8_e4m3fnuz,85,0,256.917,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3510.64,206.11,0 +80,4096,2112,7168,torch.float8_e4m3fnuz,93,0,329.2957,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3766.13,187.67,0 +80,4096,2240,7168,torch.float8_e4m3fnuz,69,0,437.1832,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3008.66,145.86,0 +80,4096,3072,1536,torch.float8_e4m3fnuz,93,0,123.6652,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3125.75,292.53,0 +80,4096,4096,512,torch.float8_e4m3fnuz,71,0,75.7421,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2268.21,498.39,0 +80,4096,4608,4096,torch.float8_e4m3fnuz,85,0,420.6946,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3675.32,174.47,0 +80,4096,4608,7168,torch.float8_e4m3fnuz,85,0,715.7879,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3780.21,139.90,0 +80,4096,7168,256,torch.float8_e4m3fnuz,71,0,85.7258,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1753.54,718.61,0 +80,4096,7168,2048,torch.float8_e4m3fnuz,71,0,345.3094,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3482.65,236.86,0 +80,4096,7168,2304,torch.float8_e4m3fnuz,71,0,379.4759,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3565.22,223.13,0 +80,4096,8192,1024,torch.float8_e4m3fnuz,71,0,227.2969,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3023.34,350.61,0 +80,4096,9216,4096,torch.float8_e4m3fnuz,71,0,812.5832,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3805.61,160.01,0 +80,4096,11264,1536,torch.float8_e4m3fnuz,71,0,417.0421,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3398.55,277.83,0 +80,4240,9216,4096,torch.float8_e4m3fnuz,93,0,903.6056,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3542.58,147.48,0 +80,8192,512,7168,torch.float8_e4m3fnuz,85,0,195.7643,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3071.53,361.55,0 +80,8192,576,7168,torch.float8_e4m3fnuz,93,0,191.4823,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3532.74,377.51,0 +80,8192,1280,8192,torch.float8_e4m3fnuz,71,0,452.4299,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3797.24,217.86,0 +80,8192,1536,7168,torch.float8_e4m3fnuz,72,0,501.1581,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3599.44,189.35,0 +80,8192,2112,7168,torch.float8_e4m3fnuz,68,0,649.3748,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3819.59,167.03,0 +80,8192,2240,7168,torch.float8_e4m3fnuz,69,0,857.9718,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3066.15,129.93,0 +80,8192,3072,1536,torch.float8_e4m3fnuz,71,0,235.4545,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3283.41,287.25,0 +80,8192,4096,512,torch.float8_e4m3fnuz,71,0,140.1016,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2452.49,523.91,0 +80,8192,4608,7168,torch.float8_e4m3fnuz,93,0,1405.0902,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3851.47,119.03,0 +80,8192,7168,256,torch.float8_e4m3fnuz,71,0,158.8437,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1892.73,764.10,0 +80,8192,7168,2048,torch.float8_e4m3fnuz,71,0,665.8308,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3612.30,223.63,0 +80,8192,7168,2304,torch.float8_e4m3fnuz,71,0,736.4592,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3674.11,207.52,0 +80,8192,8192,1024,torch.float8_e4m3fnuz,71,0,434.801,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3160.96,347.27,0 +80,8192,11264,1536,torch.float8_e4m3fnuz,71,0,818.2792,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3464.19,262.05,0 +80,16384,512,7168,torch.float8_e4m3fnuz,85,0,354.1062,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3396.13,389.40,0 +80,16384,576,7168,torch.float8_e4m3fnuz,68,0,372.1747,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3635.16,377.36,0 +80,16384,1280,8192,torch.float8_e4m3fnuz,71,0,902.6619,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3806.49,206.77,0 +80,16384,1536,7168,torch.float8_e4m3fnuz,68,0,940.0969,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3837.66,190.17,0 +80,16384,2112,7168,torch.float8_e4m3fnuz,93,0,1284.8613,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3860.87,157.05,0 +80,16384,2240,7168,torch.float8_e4m3fnuz,69,0,1644.8521,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3198.67,125.78,0 +80,16384,3072,1536,torch.float8_e4m3fnuz,71,0,450.9699,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3428.58,289.48,0 +80,16384,4096,512,torch.float8_e4m3fnuz,71,0,268.2002,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2562.25,539.54,0 +80,16384,4608,4096,torch.float8_e4m3fnuz,93,0,1613.6388,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3832.80,146.86,0 +80,16384,4608,7168,torch.float8_e4m3fnuz,93,0,2770.8265,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3906.17,108.80,0 +80,16384,7168,256,torch.float8_e4m3fnuz,71,0,306.2344,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1963.51,786.69,0 +80,16384,7168,2048,torch.float8_e4m3fnuz,71,0,1322.9659,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3636.04,214.00,0 +80,16384,7168,2304,torch.float8_e4m3fnuz,71,0,1464.8949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3694.23,197.38,0 +80,16384,8192,1024,torch.float8_e4m3fnuz,71,0,849.5169,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3235.70,345.61,0 +80,16384,9216,4096,torch.float8_e4m3fnuz,71,0,3218.0651,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3843.77,126.43,0 +80,16384,11264,1536,torch.float8_e4m3fnuz,72,0,1650.9527,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3433.99,249.29,0 +80,20480,512,7168,torch.float8_e4m3fnuz,70,0,411.9813,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3648.80,416.14,0 +80,20480,576,7168,torch.float8_e4m3fnuz,68,0,439.759,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3845.61,396.86,0 +80,20480,1536,7168,torch.float8_e4m3fnuz,93,0,1157.9543,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3894.55,190.62,0 +80,20480,3072,1536,torch.float8_e4m3fnuz,71,0,557.954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3463.97,290.36,0 +80,20480,4096,512,torch.float8_e4m3fnuz,71,0,331.5621,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2590.75,543.96,0 +80,20480,4608,7168,torch.float8_e4m3fnuz,68,0,3421.2891,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3954.40,107.73,0 +80,20480,7168,256,torch.float8_e4m3fnuz,71,0,381.5807,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1969.75,787.98,0 +80,20480,7168,2048,torch.float8_e4m3fnuz,71,0,1639.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3666.64,213.56,0 +80,20480,7168,2304,torch.float8_e4m3fnuz,71,0,1805.4245,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3746.80,197.90,0 +80,32768,512,7168,torch.float8_e4m3fnuz,70,0,664.7692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3618.07,409.32,0 +80,32768,2112,7168,torch.float8_e4m3fnuz,93,0,2566.2639,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3866.08,151.36,0 +80,32768,2240,7168,torch.float8_e4m3fnuz,69,0,3196.032,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3292.42,124.45,0 +80,32768,4096,512,torch.float8_e4m3fnuz,71,0,523.3298,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2626.24,549.00,0 +80,32768,4608,4096,torch.float8_e4m3fnuz,93,0,3207.2182,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3856.77,141.89,0 +80,32768,4608,7168,torch.float8_e4m3fnuz,68,0,5527.6346,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3916.08,103.10,0 +80,32768,7168,256,torch.float8_e4m3fnuz,71,0,600.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2001.29,798.77,0 +80,32768,7168,2304,torch.float8_e4m3fnuz,71,0,2907.7206,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3722.27,193.20,0 +80,32768,9216,4096,torch.float8_e4m3fnuz,68,0,6437.3643,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3843.03,120.54,0 +80,32768,11264,1536,torch.float8_e4m3fnuz,72,0,3296.9348,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,3439.17,244.42,0 +80,49152,2112,7168,torch.float8_e4m3fnuz,93,0,3805.7733,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3910.39,151.11,0 +80,49152,2240,7168,torch.float8_e4m3fnuz,69,0,4812.8899,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3279.53,122.29,0 +80,49152,11264,1536,torch.float8_e4m3fnuz,71,0,4796.016,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3546.29,250.23,0 +80,65536,2112,7168,torch.float8_e4m3fnuz,93,0,5062.5452,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3919.52,150.46,0 +80,65536,2240,7168,torch.float8_e4m3fnuz,69,0,6365.1402,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3306.34,122.45,0 +80,65536,11264,1536,torch.float8_e4m3fnuz,71,0,6394.7949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3546.23,249.32,0 +80,73728,2112,7168,torch.float8_e4m3fnuz,93,0,5700.3689,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3916.08,150.00,0 +80,73728,2240,7168,torch.float8_e4m3fnuz,69,0,7155.8623,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3308.62,122.26,0 +80,73728,11264,1536,torch.float8_e4m3fnuz,71,0,7197.2388,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3544.71,248.91,0 +80,131072,2112,7168,torch.float8_e4m3fnuz,93,0,10093.2233,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3931.90,149.44,0 +80,131072,2240,7168,torch.float8_e4m3fnuz,69,0,12728.0402,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3306.93,121.21,0 +80,131072,11264,1536,torch.float8_e4m3fnuz,71,0,12851.8716,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3529.05,246.77,0 diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu index 38610e41b4..e92d897b05 100755 --- a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu +++ b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle.cu @@ -46,7 +46,7 @@ RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) } else { - if(N < 1536) + if(N < 1536 || N % 128 != 0) { return a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3< DDataType, diff --git a/setup.py b/setup.py index bb7e63ddc9..3c3a146bc3 100644 --- a/setup.py +++ b/setup.py @@ -232,7 +232,7 @@ def has_ext_modules(self): python_requires=">=3.8", install_requires=[ "pybind11>=3.0.1", - # "ninja", + "ninja", "pandas", "einops", "psutil",