diff --git a/.github/scripts/build_aiter_triton.sh b/.github/scripts/build_aiter_triton.sh index 3965c3a44d..fb4af1293d 100755 --- a/.github/scripts/build_aiter_triton.sh +++ b/.github/scripts/build_aiter_triton.sh @@ -9,7 +9,15 @@ dpkg -l | grep rocm || echo "No ROCm packages found." echo echo "==== Install dependencies and aiter ====" git config --global --add safe.directory /workspace -pip install --upgrade pandas zmq einops numpy==1.26.2 +pip config set global.retries 15 +pip config set global.timeout 120 +pip install --upgrade pandas pyzmq einops numpy==1.26.2 || { + echo "WARNING: batch pip install failed, retrying packages individually..." + pip install --upgrade pandas || true + pip install --upgrade pyzmq || echo "WARNING: pyzmq unavailable (only needed by aiter.dist.shm_broadcast)" + pip install --upgrade einops + pip install --upgrade "numpy==1.26.2" +} pip uninstall -y aiter || true pip install --upgrade "pybind11>=3.0.1" pip install --upgrade "ninja>=1.11.1" diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml index 5bfcdc728e..e02b42d612 100644 --- a/.github/workflows/aiter-test.yaml +++ b/.github/workflows/aiter-test.yaml @@ -102,7 +102,7 @@ jobs: shopt -s nullglob && rm -rf dist build aiter_meta ./*.egg-info && pip install -r requirements.txt && - pip install --upgrade pandas zmq einops numpy==1.26.2 && + pip install --upgrade pandas pyzmq einops numpy==1.26.2 && pip install --upgrade "pybind11>=3.0.1" && pip install --upgrade "ninja>=1.11.1" && pip install --upgrade setuptools_scm && @@ -372,7 +372,7 @@ jobs: bash -lc " pip uninstall -y amd-aiter aiter || true pip install -r requirements.txt - pip install --upgrade pandas zmq einops numpy==1.26.2 + pip install --upgrade pandas pyzmq einops numpy==1.26.2 pip install --upgrade 'pybind11>=3.0.1' pip install --upgrade 'ninja>=1.11.1' pip install tabulate @@ -573,7 +573,7 @@ jobs: bash -lc " pip uninstall -y amd-aiter aiter || true pip install -r requirements.txt - pip 
install --upgrade pandas zmq einops numpy==1.26.2 + pip install --upgrade pandas pyzmq einops numpy==1.26.2 pip install --upgrade 'pybind11>=3.0.1' pip install --upgrade 'ninja>=1.11.1' pip install tabulate diff --git a/.github/workflows/vllm_benchmark.yaml b/.github/workflows/vllm_benchmark.yaml index c1402660ad..16bc1494b2 100644 --- a/.github/workflows/vllm_benchmark.yaml +++ b/.github/workflows/vllm_benchmark.yaml @@ -99,7 +99,7 @@ jobs: pip config set global.retries 10 pip config set global.index-url https://ausartifactory.amd.com/artifactory/api/pypi/hw-cpe-prod-remote/simple pip install -r requirements.txt - pip install --upgrade pandas zmq einops numpy==1.26.2 + pip install --upgrade pandas pyzmq einops numpy==1.26.2 pip install --upgrade "pybind11>=3.0.1" pip install --upgrade "ninja>=1.11.1" pip install --upgrade "setuptools_scm[toml]>=6.2" wheel packaging psutil diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_next_80b_a3b.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_next_80b_a3b.csv new file mode 100644 index 0000000000..77801ed548 --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_next_80b_a3b.csv @@ -0,0 +1,1483 @@ +gfx,cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +gfx950,256,1,256,2048,ck,8,3,5.3821,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.19,97.89,0.0 +gfx950,256,2,256,2048,ck,8,3,5.3824,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.39,98.36,0.0 +gfx950,256,4,256,2048,ck,8,3,5.3455,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.78,100.0,0.0 +gfx950,256,8,256,2048,ck,8,3,5.4095,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.55,100.71,0.0 
+gfx950,256,16,256,2048,ck,8,3,5.4627,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.07,103.47,0.0 +gfx950,256,32,256,2048,ck,8,3,6.116,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.49,99.12,0.0 +gfx950,256,48,256,2048,ck,8,3,6.2752,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.02,103.13,0.0 +gfx950,256,64,256,2048,ck,8,2,5.9001,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.37,116.63,0.0 +gfx950,256,80,256,2048,ck,6,3,6.174,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,13.59,118.09,0.0 +gfx950,256,96,256,2048,ck,8,2,6.0132,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.74,128.06,0.0 +gfx950,256,112,256,2048,ck,8,3,6.4352,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.25,126.03,0.0 +gfx950,256,128,256,2048,ck,8,3,6.6216,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.27,128.66,0.0 +gfx950,256,256,256,2048,ck,8,2,5.9766,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,44.91,197.38,0.0 +gfx950,256,512,256,2048,ck,8,2,6.5051,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,82.53,282.09,0.0 +gfx950,256,588,256,2048,ck,8,2,7.3827,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,83.51,274.91,0.0 +gfx950,256,773,256,2048,ck,8,0,8.2754,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.95,302.48,0.0 
+gfx950,256,822,256,2048,ck,8,0,8.386,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,102.78,313.45,0.0 +gfx950,256,1024,256,2048,ck,8,0,8.3191,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,129.07,378.13,0.0 +gfx950,256,1025,256,2048,ck,8,2,9.6064,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,111.88,327.73,0.0 +gfx950,256,1027,256,2048,ck,8,2,9.5908,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,112.28,328.8,0.0 +gfx950,256,1042,256,2048,ck,7,0,9.5431,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,114.49,334.46,0.0 +gfx950,256,1051,256,2048,ck,8,2,9.6922,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,113.71,331.69,0.0 +gfx950,256,1055,256,2048,ck,8,2,9.4241,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,117.38,342.22,0.0 +gfx950,256,1057,256,2048,ck,7,0,9.6419,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,114.95,335.02,0.0 +gfx950,256,1069,256,2048,ck,8,2,9.0751,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,123.52,359.33,0.0 +gfx950,256,1072,256,2048,ck,8,2,8.8589,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,126.89,368.96,0.0 +gfx950,256,1074,256,2048,ck,8,2,9.6548,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,116.64,339.08,0.0 +gfx950,256,1091,256,2048,ck,13,0,9.7891,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,116.86,338.87,0.0 
+gfx950,256,1128,256,2048,ck,8,2,9.7553,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,121.25,349.76,0.0 +gfx950,256,1136,256,2048,ck,8,2,9.9427,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,119.8,345.22,0.0 +gfx950,256,1514,256,2048,ck,7,0,10.1961,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,155.7,431.55,0.0 +gfx950,256,2017,256,2048,ck,7,0,9.9017,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,213.6,574.43,0.0 +gfx950,256,2048,256,2048,ck,7,0,9.9236,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,216.4,581.16,0.0 +gfx950,256,2151,256,2048,ck,18,0,11.5669,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,194.99,521.39,0.0 +gfx950,256,2332,256,2048,ck,18,0,10.9901,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,222.5,590.91,0.0 +gfx950,256,2341,256,2048,ck,18,0,11.7789,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,208.4,553.3,0.0 +gfx950,256,3072,256,2048,ck,18,0,10.4911,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,307.04,799.59,0.0 +gfx950,256,3073,256,2048,ck,18,0,12.5746,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,256.25,667.31,0.0 +gfx950,256,3089,256,2048,ck,18,0,12.4563,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,260.03,676.94,0.0 +gfx950,256,3110,256,2048,ck,18,0,12.4643,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,261.63,680.82,0.0 
+gfx950,256,3137,256,2048,ck,18,0,12.2899,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,267.65,696.1,0.0 +gfx950,256,3613,256,2048,ck,18,0,12.4583,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,304.09,784.5,0.0 +gfx950,256,4096,256,2048,ck,18,0,10.7697,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,398.8,1022.32,0.0 +gfx950,256,4097,256,2048,ck,2,0,15.3295,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,280.25,718.39,0.0 +gfx950,256,4109,256,2048,ck,2,0,15.4991,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,277.99,712.51,0.0 +gfx950,256,4143,256,2048,ck,2,0,15.4969,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,280.33,718.23,0.0 +gfx950,256,4345,256,2048,ck,3,0,15.3931,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,295.98,756.67,0.0 +gfx950,256,4413,256,2048,ck,2,0,15.2946,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,302.55,772.92,0.0 +gfx950,256,5121,256,2048,ck,2,0,15.4833,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,346.81,880.56,0.0 +gfx950,256,5123,256,2048,ck,2,0,15.9826,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,336.11,853.38,0.0 +gfx950,256,5133,256,2048,ck,2,0,15.5043,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,347.15,881.35,0.0 +gfx950,256,5185,256,2048,ck,2,0,15.4368,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,352.2,893.83,0.0 
+gfx950,256,5191,256,2048,ck,2,0,15.4253,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,352.87,895.49,0.0 +gfx950,256,6146,256,2048,ck,2,0,16.0915,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,400.49,1010.35,0.0 +gfx950,256,6147,256,2048,ck,2,0,15.9455,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,404.23,1019.76,0.0 +gfx950,256,6155,256,2048,ck,2,0,16.1409,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,399.85,1008.69,0.0 +gfx950,256,6377,256,2048,ck,2,0,16.0291,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,417.16,1051.18,0.0 +gfx950,256,6401,256,2048,ck,2,0,16.1527,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,415.53,1046.94,0.0 +gfx950,256,7121,256,2048,ck,2,0,16.0775,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,464.43,1166.48,0.0 +gfx950,256,7169,256,2048,ck,2,0,16.3494,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,459.79,1154.59,0.0 +gfx950,256,7170,256,2048,ck,2,0,15.6111,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,481.6,1209.36,0.0 +gfx950,256,7176,256,2048,ck,2,0,15.7281,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,478.42,1201.34,0.0 +gfx950,256,7177,256,2048,ck,2,0,15.7152,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,478.88,1202.49,0.0 +gfx950,256,7185,256,2048,ck,2,0,15.608,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,482.7,1212.06,0.0 
+gfx950,256,7206,256,2048,ck,2,0,15.7361,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,480.17,1205.61,0.0 +gfx950,256,7217,256,2048,ck,2,0,15.7324,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,481.02,1207.69,0.0 +gfx950,256,7257,256,2048,ck,2,0,15.6396,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,486.55,1221.4,0.0 +gfx950,256,7265,256,2048,ck,2,0,15.7144,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,484.77,1216.89,0.0 +gfx950,256,7393,256,2048,ck,2,0,15.7346,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,492.68,1236.15,0.0 +gfx950,256,7461,256,2048,ck,2,0,16.3523,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,478.43,1200.1,0.0 +gfx950,256,7634,256,2048,ck,2,0,15.8494,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,505.06,1266.13,0.0 +gfx950,256,8129,256,2048,ck,2,0,15.8496,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,537.8,1346.06,0.0 +gfx950,256,8192,256,2048,ck,2,0,15.4491,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,556.02,1391.4,0.0 +gfx950,256,1,1024,2048,ck,8,3,5.5699,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.75,377.25,0.0 +gfx950,256,2,1024,2048,ck,8,3,5.5539,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.51,379.07,0.0 +gfx950,256,4,1024,2048,ck,8,3,5.5885,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.0,378.19,0.0 
+gfx950,256,8,1024,2048,ck,8,3,5.6826,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.9,374.81,0.0 +gfx950,256,16,1024,2048,ck,8,3,6.0772,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.04,355.87,0.0 +gfx950,256,32,1024,2048,ck,8,2,6.8076,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.72,327.31,0.0 +gfx950,256,48,1024,2048,ck,8,2,6.7293,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.92,340.86,0.0 +gfx950,256,64,1024,2048,ck,8,2,6.535,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.08,361.02,0.0 +gfx950,256,80,1024,2048,ck,8,2,7.2538,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,46.26,334.28,0.0 +gfx950,256,96,1024,2048,ck,8,2,7.0457,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,57.15,353.46,0.0 +gfx950,256,112,1024,2048,ck,8,2,7.4564,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,63.0,342.78,0.0 +gfx950,256,128,1024,2048,ck,8,2,7.5121,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,71.47,348.96,0.0 +gfx950,256,256,1024,2048,ck,8,0,8.1467,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,131.8,386.14,0.0 +gfx950,256,512,1024,2048,ck,18,0,9.6948,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,221.51,432.63,0.0 +gfx950,256,667,1024,2048,ck,18,0,11.3223,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,247.09,426.52,0.0 
+gfx950,256,671,1024,2048,ck,18,0,11.1567,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,252.26,434.32,0.0 +gfx950,256,931,1024,2048,ck,18,0,11.4872,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,339.93,514.53,0.0 +gfx950,256,1024,1024,2048,ck,18,0,10.5216,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,408.2,597.96,0.0 +gfx950,256,1025,1024,2048,ck,2,0,14.903,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,288.48,422.44,0.0 +gfx950,256,1027,1024,2048,ck,2,0,14.9158,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,288.79,422.62,0.0 +gfx950,256,1031,1024,2048,ck,2,0,15.0035,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,288.22,421.24,0.0 +gfx950,256,1039,1024,2048,ck,3,0,15.2155,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,286.41,417.53,0.0 +gfx950,256,1040,1024,2048,ck,2,0,15.0975,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,288.93,421.06,0.0 +gfx950,256,1041,1024,2048,ck,2,0,14.9686,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,291.7,424.96,0.0 +gfx950,256,1072,1024,2048,ck,2,0,15.4859,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,290.35,418.97,0.0 +gfx950,256,1073,1024,2048,ck,2,0,15.5038,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,290.28,418.75,0.0 +gfx950,256,1077,1024,2048,ck,2,0,15.0568,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,300.01,432.27,0.0 
+gfx950,256,1091,1024,2048,ck,2,0,14.9712,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,305.65,438.57,0.0 +gfx950,256,1343,1024,2048,ck,2,0,15.2254,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,369.97,499.04,0.0 +gfx950,256,1688,1024,2048,ck,2,0,15.5828,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,454.35,578.28,0.0 +gfx950,256,2017,1024,2048,ck,2,0,15.7146,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.35,659.18,0.0 +gfx950,256,2048,1024,2048,ck,2,0,15.6905,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,547.46,668.29,0.0 +gfx950,256,2050,1024,2048,ck,3,0,18.9571,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,453.57,553.56,0.0 +gfx950,256,2063,1024,2048,ck,3,0,19.0224,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,454.88,554.46,0.0 +gfx950,256,2064,1024,2048,ck,3,0,19.048,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,454.49,553.93,0.0 +gfx950,256,2099,1024,2048,ck,3,0,19.0723,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,461.6,560.74,0.0 +gfx950,256,2159,1024,2048,ck,3,0,19.2061,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,471.49,569.63,0.0 +gfx950,256,2160,1024,2048,ck,3,0,19.248,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,470.68,568.61,0.0 +gfx950,256,2313,1024,2048,ck,3,0,19.3877,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,500.39,596.83,0.0 
+gfx950,256,2325,1024,2048,ck,3,0,19.2542,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,506.47,603.52,0.0 +gfx950,256,3072,1024,2048,ck,0,0,19.6534,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,655.61,746.95,0.0 +gfx950,256,3073,1024,2048,ck,0,0,20.4847,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,629.21,716.84,0.0 +gfx950,256,3182,1024,2048,ck,0,0,20.6063,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,647.68,734.27,0.0 +gfx950,256,3183,1024,2048,ck,0,0,20.7299,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,644.02,730.09,0.0 +gfx950,256,3847,1024,2048,ck,0,0,21.2034,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,760.99,842.06,0.0 +gfx950,256,4096,1024,2048,ck,0,0,20.451,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,840.05,922.91,0.0 +gfx950,256,4111,1024,2048,ck,2,0,27.8924,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,618.19,678.89,0.0 +gfx950,256,4141,1024,2048,ck,2,0,28.1871,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,616.19,676.15,0.0 +gfx950,256,4142,1024,2048,ck,2,0,28.1318,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,617.55,677.62,0.0 +gfx950,256,4345,1024,2048,ck,2,0,28.6182,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,636.81,695.16,0.0 +gfx950,256,5105,1024,2048,ck,2,0,29.6312,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,722.61,776.45,0.0 
+gfx950,256,5121,1024,2048,ck,2,0,29.6364,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,724.75,778.53,0.0 +gfx950,256,5123,1024,2048,ck,2,0,28.7739,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,746.77,802.15,0.0 +gfx950,256,5131,1024,2048,ck,2,0,29.259,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,735.53,789.97,0.0 +gfx950,256,5427,1024,2048,ck,2,0,29.7766,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,764.44,816.96,0.0 +gfx950,256,6113,1024,2048,ck,2,0,31.1336,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,823.54,871.6,0.0 +gfx950,256,6197,1024,2048,ck,0,0,32.0735,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,810.39,856.78,0.0 +gfx950,256,6409,1024,2048,ck,0,0,31.7421,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,846.87,893.09,0.0 +gfx950,256,7121,1024,2048,ck,0,0,33.4641,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,892.53,934.28,0.0 +gfx950,256,7168,1024,2048,ck,0,0,32.2561,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,932.06,975.24,0.0 +gfx950,256,7169,1024,2048,ck,0,0,33.1753,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,906.37,948.34,0.0 +gfx950,256,7172,1024,2048,ck,0,0,33.5396,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,896.9,938.4,0.0 +gfx950,256,7176,1024,2048,ck,0,0,33.4608,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,899.51,941.1,0.0 
+gfx950,256,7177,1024,2048,ck,0,0,33.5128,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,898.24,939.76,0.0 +gfx950,256,7178,1024,2048,ck,0,0,33.6505,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,894.69,936.04,0.0 +gfx950,256,7179,1024,2048,ck,0,0,33.8196,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,890.34,931.48,0.0 +gfx950,256,7183,1024,2048,ck,0,0,33.1444,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,908.98,950.95,0.0 +gfx950,256,7184,1024,2048,ck,0,0,32.9884,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,913.41,955.57,0.0 +gfx950,256,7209,1024,2048,ck,0,0,33.3848,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,905.7,947.29,0.0 +gfx950,256,7210,1024,2048,ck,0,0,32.9932,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,916.58,958.66,0.0 +gfx950,256,7211,1024,2048,ck,0,0,32.9871,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,916.88,958.96,0.0 +gfx950,256,7217,1024,2048,ck,0,0,33.4568,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,904.76,946.23,0.0 +gfx950,256,7273,1024,2048,ck,0,0,33.0212,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,923.81,965.66,0.0 +gfx950,256,7274,1024,2048,ck,0,0,33.3125,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,915.85,957.34,0.0 +gfx950,256,7391,1024,2048,ck,0,0,34.0889,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,909.39,949.6,0.0 
+gfx950,256,7393,1024,2048,ck,0,0,33.6873,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,920.48,961.16,0.0 +gfx950,256,7715,1024,2048,ck,0,0,34.5112,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,937.64,976.43,0.0 +gfx950,256,7753,1024,2048,ck,0,0,34.0489,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,955.05,994.26,0.0 +gfx950,256,7819,1024,2048,ck,0,0,34.428,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,952.58,991.16,0.0 +gfx950,256,8099,1024,2048,ck,0,0,34.8149,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,975.72,1013.09,0.0 +gfx950,256,8129,1024,2048,ck,0,0,34.7801,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,980.32,1017.64,0.0 +gfx950,256,8188,1024,2048,ck,0,0,34.4069,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,998.14,1035.7,0.0 +gfx950,256,8192,1024,2048,ck,0,0,34.2033,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1004.57,1042.34,0.0 +gfx950,256,1,2048,128,ck,6,0,3.4051,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.15,78.23,0.0 +gfx950,256,1,2048,512,ck,8,0,4.0125,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.52,262.48,0.0 +gfx950,256,1,2048,1024,ck,8,0,5.1914,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.81,404.95,0.0 +gfx950,256,1,2048,2048,ck,8,3,5.988,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.4,701.48,0.0 
+gfx950,256,1,2048,4096,ck,8,3,6.7023,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.5,1252.82,0.0 +gfx950,256,2,2048,128,ck,6,0,3.4162,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.31,79.21,0.0 +gfx950,256,2,2048,512,ck,8,0,4.0472,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.04,261.36,0.0 +gfx950,256,2,2048,1024,ck,8,0,5.2099,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.61,404.5,0.0 +gfx950,256,2,2048,2048,ck,8,3,6.0141,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.79,699.45,0.0 +gfx950,256,2,2048,4096,ck,8,3,6.731,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.99,1248.7,0.0 +gfx950,256,4,2048,128,ck,6,0,3.4658,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0.61,80.51,0.0 +gfx950,256,4,2048,512,ck,8,0,4.0338,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.08,264.52,0.0 +gfx950,256,4,2048,1024,ck,8,0,5.2488,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.2,403.45,0.0 +gfx950,256,4,2048,2048,ck,8,3,6.0972,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.5,691.94,0.0 +gfx950,256,4,2048,4096,ck,8,3,6.8665,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.77,1226.44,0.0 +gfx950,256,8,2048,128,ck,6,0,3.4811,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,1.2,85.01,0.0 +gfx950,256,8,2048,512,ck,8,0,4.0618,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.13,267.23,0.0 
+gfx950,256,8,2048,1024,ck,8,0,5.2043,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.45,410.84,0.0 +gfx950,256,8,2048,2048,ck,8,3,6.5345,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.27,649.39,0.0 +gfx950,256,8,2048,4096,ck,8,3,7.2892,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.41,1159.82,0.0 +gfx950,256,16,2048,128,ck,6,0,3.477,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,2.41,94.83,0.0 +gfx950,256,16,2048,512,ck,8,0,4.0034,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.38,280.34,0.0 +gfx950,256,16,2048,1024,ck,8,0,5.1105,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,13.13,426.39,0.0 +gfx950,256,16,2048,2048,ck,8,3,6.7454,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.9,636.38,0.0 +gfx950,256,16,2048,4096,ck,8,3,7.5896,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,35.37,1122.55,0.0 +gfx950,256,24,2048,2048,ck,8,2,7.1413,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.19,607.98,0.0 +gfx950,256,32,2048,128,ck,11,0,3.6966,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_2x1_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,4.54,107.48,0.0 +gfx950,256,32,2048,512,ck,8,0,4.2461,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,15.8,281.68,0.0 +gfx950,256,32,2048,1024,ck,8,0,5.3628,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.03,421.61,0.0 
+gfx950,256,32,2048,2048,ck,8,2,7.3617,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.46,596.45,0.0 +gfx950,256,32,2048,4096,ck,8,3,8.4293,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,63.69,1026.27,0.0 +gfx950,256,40,2048,2048,ck,8,2,7.8372,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,42.81,566.54,0.0 +gfx950,256,48,2048,128,ck,6,0,3.7178,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,6.77,125.05,0.0 +gfx950,256,48,2048,512,ck,8,0,4.2755,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.54,296.99,0.0 +gfx950,256,48,2048,1024,ck,8,0,5.4089,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,37.22,433.16,0.0 +gfx950,256,48,2048,2048,ck,8,2,7.8546,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,51.26,571.54,0.0 +gfx950,256,48,2048,4096,ck,8,3,9.1141,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,88.36,963.54,0.0 +gfx950,256,56,2048,2048,ck,8,2,7.9912,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,58.78,567.92,0.0 +gfx950,256,64,2048,128,ck,6,0,3.601,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,9.32,147.87,0.0 +gfx950,256,64,2048,512,ck,8,0,4.1633,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,32.24,322.7,0.0 +gfx950,256,64,2048,1024,ck,8,0,5.3642,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.04,452.04,0.0 
+gfx950,256,64,2048,2048,ck,8,2,7.5921,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,70.71,604.25,0.0 +gfx950,256,64,2048,4096,ck,8,3,10.0486,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,106.85,886.98,0.0 +gfx950,256,72,2048,2048,ck,8,2,8.356,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,72.28,554.89,0.0 +gfx950,256,80,2048,128,ck,6,0,3.6346,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,11.54,165.1,0.0 +gfx950,256,80,2048,512,ck,8,0,4.2018,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,39.93,337.29,0.0 +gfx950,256,80,2048,1024,ck,8,0,5.417,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,61.94,462.76,0.0 +gfx950,256,80,2048,2048,ck,8,0,8.1754,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,82.09,573.16,0.0 +gfx950,256,80,2048,4096,ck,8,3,10.8228,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,124.01,835.64,0.0 +gfx950,256,88,2048,2048,ck,8,2,8.0804,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,91.36,585.98,0.0 +gfx950,256,96,2048,128,ck,6,0,3.6695,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,13.72,181.95,0.0 +gfx950,256,96,2048,512,ck,8,0,4.2287,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.61,352.58,0.0 +gfx950,256,96,2048,1024,ck,8,0,5.4506,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,73.87,474.93,0.0 
+gfx950,256,96,2048,2048,ck,8,0,8.1093,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.31,589.96,0.0 +gfx950,256,96,2048,4096,ck,8,3,11.2319,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,143.4,816.87,0.0 +gfx950,256,104,2048,2048,ck,8,0,8.4246,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.56,573.71,0.0 +gfx950,256,112,2048,128,ck,6,0,3.8023,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,15.44,193.37,0.0 +gfx950,256,112,2048,512,ck,8,0,4.427,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,53.06,353.44,0.0 +gfx950,256,112,2048,1024,ck,8,0,5.5917,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,84.01,477.6,0.0 +gfx950,256,112,2048,2048,ck,8,0,8.175,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.93,597.24,0.0 +gfx950,256,112,2048,4096,ck,8,3,12.371,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,151.89,752.25,0.0 +gfx950,256,120,2048,2048,ck,8,0,8.4214,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,119.53,585.6,0.0 +gfx950,256,128,2048,128,ck,6,0,3.8449,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,17.45,208.8,0.0 +gfx950,256,128,2048,512,ck,8,0,4.4313,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,60.58,369.73,0.0 +gfx950,256,128,2048,1024,ck,8,0,5.6674,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.73,485.67,0.0 
+gfx950,256,128,2048,2048,ck,8,0,8.2927,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,129.48,600.62,0.0 +gfx950,256,128,2048,4096,ck,8,3,13.1897,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,162.82,715.5,0.0 +gfx950,256,136,2048,2048,ck,7,0,9.9875,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,114.23,503.62,0.0 +gfx950,256,144,2048,2048,ck,7,0,9.9009,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,122.01,512.99,0.0 +gfx950,256,152,2048,2048,ck,7,0,10.0404,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,126.99,510.76,0.0 +gfx950,256,160,2048,2048,ck,7,0,9.9491,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,134.9,520.38,0.0 +gfx950,256,168,2048,2048,ck,7,0,10.0554,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,140.15,519.77,0.0 +gfx950,256,176,2048,2048,ck,7,0,9.9808,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,147.92,528.58,0.0 +gfx950,256,184,2048,2048,ck,7,0,10.1608,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,151.91,524.05,0.0 +gfx950,256,192,2048,2048,ck,7,0,10.0305,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,160.57,535.76,0.0 +gfx950,256,200,2048,2048,ck,7,0,9.8391,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,170.52,551.18,0.0 +gfx950,256,208,2048,2048,ck,7,0,9.7873,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,178.27,559.12,0.0 
+gfx950,256,216,2048,2048,ck,7,0,9.6605,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,187.56,571.54,0.0 +gfx950,256,224,2048,2048,ck,13,0,9.7631,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,192.46,570.57,0.0 +gfx950,256,232,2048,2048,ck,7,0,9.4965,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,204.93,591.77,0.0 +gfx950,256,240,2048,2048,ck,8,2,12.0741,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,166.74,469.51,0.0 +gfx950,256,248,2048,2048,ck,7,0,9.6314,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,216.0,593.68,0.0 +gfx950,256,256,2048,128,cktile,7,0,4.7272,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,28.39,284.2,0.0 +gfx950,256,256,2048,512,ck,13,0,4.5643,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,117.62,488.19,0.0 +gfx950,256,256,2048,1024,ck,13,0,6.379,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,168.32,534.23,0.0 +gfx950,256,256,2048,2048,ck,7,0,9.6979,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,221.44,594.68,0.0 +gfx950,256,256,2048,4096,ck,18,0,16.1668,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,265.67,648.6,0.0 +gfx950,256,272,2048,2048,ck,18,0,10.8286,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,210.71,541.66,0.0 +gfx950,256,288,2048,2048,ck,18,0,10.9014,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,221.62,547.07,0.0 
+gfx950,256,304,2048,2048,ck,18,0,11.1628,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,228.45,543.06,0.0 +gfx950,256,320,2048,2048,ck,18,0,10.0339,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,267.53,613.96,0.0 +gfx950,256,336,2048,2048,ck,18,0,11.3481,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,248.37,551.52,0.0 +gfx950,256,352,2048,2048,ck,12,0,11.1248,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,265.42,571.43,0.0 +gfx950,256,368,2048,2048,ck,18,0,11.1734,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,276.28,577.74,0.0 +gfx950,256,384,2048,2048,ck,12,0,11.4778,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,280.65,570.98,0.0 +gfx950,256,400,2048,2048,ck,18,0,10.9292,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,307.02,608.64,0.0 +gfx950,256,416,2048,2048,ck,18,0,11.2714,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,309.6,598.88,0.0 +gfx950,256,432,2048,2048,ck,18,0,11.4322,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,316.99,599.05,0.0 +gfx950,256,448,2048,2048,ck,18,0,10.382,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,361.98,669.12,0.0 +gfx950,256,464,2048,2048,ck,18,0,11.7079,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,332.45,601.74,0.0 +gfx950,256,480,2048,2048,ck,18,0,11.4589,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,351.39,623.4,0.0 
+gfx950,256,496,2048,2048,ck,18,0,11.6726,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,356.45,620.4,0.0 +gfx950,256,512,2048,128,ck,16,0,4.5921,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,58.46,528.04,0.0 +gfx950,256,512,2048,512,ck,18,0,5.479,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,195.97,621.99,0.0 +gfx950,256,512,2048,1024,ck,18,0,7.307,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,293.89,645.76,0.0 +gfx950,256,512,2048,2048,ck,18,0,10.1647,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,422.54,722.11,0.0 +gfx950,256,512,2048,4096,ck,18,0,16.8136,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,510.89,748.38,0.0 +gfx950,256,588,2048,128,ck,6,0,5.2126,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,59.14,526.77,0.0 +gfx950,256,588,2048,1024,ck,3,0,9.9379,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,248.17,513.96,0.0 +gfx950,256,664,2048,2048,ck,3,0,15.8233,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,352.01,522.89,0.0 +gfx950,256,667,2048,512,ck,13,0,6.8242,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,204.98,604.04,0.0 +gfx950,256,667,2048,4096,ck,2,0,25.4018,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,440.54,545.34,0.0 +gfx950,256,671,2048,512,ck,13,0,6.7793,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,207.57,610.76,0.0 
+gfx950,256,671,2048,4096,ck,2,0,25.4122,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,443.0,546.41,0.0 +gfx950,256,762,2048,2048,ck,2,0,15.2017,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,420.49,583.88,0.0 +gfx950,256,773,2048,128,ck,16,0,5.7653,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,70.3,611.81,0.0 +gfx950,256,773,2048,1024,ck,3,0,9.9334,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,326.39,609.55,0.0 +gfx950,256,822,2048,128,ck,6,0,5.8617,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,73.52,637.06,0.0 +gfx950,256,822,2048,1024,ck,18,0,10.3628,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,332.7,608.5,0.0 +gfx950,256,902,2048,2048,ck,2,0,15.4979,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,488.23,628.23,0.0 +gfx950,256,931,2048,512,ck,18,0,7.4235,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,263.01,719.15,0.0 +gfx950,256,931,2048,4096,ck,2,0,26.0911,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,598.66,613.82,0.0 +gfx950,256,961,2048,2048,ck,2,0,15.6255,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,515.92,646.3,0.0 +gfx950,256,1000,2048,2048,ck,2,0,15.7485,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,532.66,656.46,0.0 +gfx950,256,1001,2048,2048,ck,2,0,15.6989,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,534.88,658.93,0.0 
+gfx950,256,1002,2048,2048,ck,2,0,15.8391,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,530.67,653.48,0.0 +gfx950,256,1003,2048,2048,ck,2,0,15.7897,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,532.86,655.92,0.0 +gfx950,256,1015,2048,2048,ck,3,0,16.9587,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,502.07,615.05,0.0 +gfx950,256,1023,2048,2048,ck,2,0,15.7482,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,544.92,665.45,0.0 +gfx950,256,1024,2048,128,ck,16,0,5.8407,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,91.92,785.44,0.0 +gfx950,256,1024,2048,512,ck,18,0,7.4642,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,287.7,772.64,0.0 +gfx950,256,1024,2048,1024,ck,3,0,9.9579,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,431.31,737.11,0.0 +gfx950,256,1024,2048,2048,ck,2,0,15.3319,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,560.27,683.92,0.0 +gfx950,256,1024,2048,4096,ck,2,0,25.7958,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,665.99,650.39,0.0 +gfx950,256,1025,2048,128,cktile,19,0,6.597,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,81.46,696.04,0.0 +gfx950,256,1025,2048,512,ck,13,0,8.9525,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,240.11,644.71,0.0 +gfx950,256,1025,2048,1024,ck,3,0,12.4335,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,345.77,590.75,0.0 
+gfx950,256,1025,2048,2048,ck,3,0,19.0577,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,451.17,550.53,0.0 +gfx950,256,1025,2048,4096,ck,0,0,32.2845,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,532.66,519.92,0.0 +gfx950,256,1027,2048,128,ck,6,0,6.5456,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,82.26,702.79,0.0 +gfx950,256,1027,2048,512,ck,18,0,8.9521,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,240.59,645.77,0.0 +gfx950,256,1027,2048,1024,ck,3,0,12.6237,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,341.23,582.67,0.0 +gfx950,256,1027,2048,4096,ck,0,0,32.1312,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,536.25,522.91,0.0 +gfx950,256,1031,2048,512,ck,13,0,8.9537,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,241.48,647.71,0.0 +gfx950,256,1031,2048,4096,ck,0,0,32.1357,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,538.26,523.86,0.0 +gfx950,256,1032,2048,2048,ck,3,0,19.0775,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,453.78,552.22,0.0 +gfx950,256,1038,2048,2048,ck,3,0,19.0491,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,457.1,554.98,0.0 +gfx950,256,1039,2048,512,ck,13,0,8.8885,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,245.14,656.61,0.0 +gfx950,256,1039,2048,4096,ck,0,0,32.2929,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,539.79,523.34,0.0 
+gfx950,256,1040,2048,512,ck,13,0,8.9441,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,243.85,653.04,0.0 +gfx950,256,1040,2048,4096,ck,0,0,32.1598,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,542.55,525.76,0.0 +gfx950,256,1041,2048,512,ck,13,0,8.9623,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,243.59,652.23,0.0 +gfx950,256,1041,2048,4096,ck,0,0,32.3151,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,540.46,523.49,0.0 +gfx950,256,1042,2048,128,ck,6,0,6.6291,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,82.41,703.5,0.0 +gfx950,256,1042,2048,1024,ck,3,0,12.5106,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,349.34,594.07,0.0 +gfx950,256,1051,2048,128,ck,6,0,6.6272,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,83.15,709.44,0.0 +gfx950,256,1051,2048,1024,ck,3,0,13.0515,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,337.76,572.98,0.0 +gfx950,256,1055,2048,128,ck,6,0,6.6338,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,83.38,711.28,0.0 +gfx950,256,1055,2048,1024,ck,3,0,13.1067,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,337.61,572.13,0.0 +gfx950,256,1057,2048,128,ck,6,0,6.6191,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,83.72,714.13,0.0 +gfx950,256,1057,2048,1024,ck,3,0,12.6695,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,349.93,592.68,0.0 
+gfx950,256,1069,2048,128,ck,6,0,6.2065,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,90.3,769.77,0.0 +gfx950,256,1069,2048,1024,ck,3,0,12.6679,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,353.94,597.61,0.0 +gfx950,256,1072,2048,128,ck,6,0,6.168,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,91.12,776.63,0.0 +gfx950,256,1072,2048,512,ck,13,0,8.6539,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,259.78,691.98,0.0 +gfx950,256,1072,2048,1024,ck,3,0,12.1687,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,369.5,623.39,0.0 +gfx950,256,1072,2048,4096,ck,2,0,32.5769,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,552.08,527.07,0.0 +gfx950,256,1073,2048,512,ck,18,0,8.623,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,260.96,695.0,0.0 +gfx950,256,1073,2048,4096,ck,0,0,32.5998,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,552.21,526.95,0.0 +gfx950,256,1074,2048,128,ck,6,0,6.6566,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,84.59,720.9,0.0 +gfx950,256,1074,2048,1024,ck,3,0,12.3787,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,363.91,613.64,0.0 +gfx950,256,1077,2048,512,ck,13,0,9.0109,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,250.66,667.12,0.0 +gfx950,256,1077,2048,4096,ck,0,0,32.5662,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,554.84,528.5,0.0 
+gfx950,256,1087,2048,2048,ck,2,0,19.7274,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,462.22,551.15,0.0 +gfx950,256,1091,2048,128,cktile,4,0,6.5966,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x128_intrawave_0x1x0_1,86.71,738.34,0.0 +gfx950,256,1091,2048,512,ck,18,0,8.9521,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,255.58,678.71,0.0 +gfx950,256,1091,2048,1024,ck,3,0,13.1565,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,347.81,583.98,0.0 +gfx950,256,1091,2048,4096,ck,0,0,32.6295,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,560.96,530.99,0.0 +gfx950,256,1128,2048,128,ck,11,0,6.6147,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_2x1_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,89.41,759.95,0.0 +gfx950,256,1128,2048,1024,ck,3,0,13.1877,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,358.76,596.96,0.0 +gfx950,256,1136,2048,128,ck,11,0,6.6779,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_2x1_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,89.19,757.81,0.0 +gfx950,256,1136,2048,1024,ck,3,0,12.4973,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,381.26,633.21,0.0 +gfx950,256,1255,2048,2048,ck,3,0,19.3804,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,543.21,614.28,0.0 +gfx950,256,1343,2048,512,ck,18,0,9.3665,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,300.7,772.66,0.0 +gfx950,256,1343,2048,4096,ck,0,0,32.9063,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,684.73,589.26,0.0 
+gfx950,256,1351,2048,2048,ck,0,0,20.1533,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,562.34,619.99,0.0 +gfx950,256,1376,2048,2048,ck,0,0,20.2049,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,571.28,626.01,0.0 +gfx950,256,1380,2048,2048,ck,3,0,20.1807,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,573.63,627.98,0.0 +gfx950,256,1391,2048,2048,ck,0,0,20.3413,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,573.64,626.34,0.0 +gfx950,256,1514,2048,128,cktile,19,0,6.97,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,113.88,955.13,0.0 +gfx950,256,1514,2048,1024,ck,2,0,13.8,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,460.16,713.68,0.0 +gfx950,256,1688,2048,512,ck,2,0,10.1007,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,350.47,873.89,0.0 +gfx950,256,1688,2048,4096,ck,0,0,34.0748,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,831.11,652.0,0.0 +gfx950,256,1710,2048,2048,ck,0,0,20.6478,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,694.72,711.97,0.0 +gfx950,256,1808,2048,2048,ck,0,0,20.68,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,733.39,739.97,0.0 +gfx950,256,1809,2048,2048,ck,0,0,20.9096,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,725.74,732.14,0.0 +gfx950,256,1870,2048,2048,ck,0,0,21.1971,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,740.04,739.89,0.0 
+gfx950,256,1967,2048,2048,ck,0,0,21.1886,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,778.74,768.32,0.0 +gfx950,256,1970,2048,2048,ck,2,0,22.7399,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,726.72,716.71,0.0 +gfx950,256,2000,2048,2048,ck,0,0,21.2252,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,790.44,776.54,0.0 +gfx950,256,2001,2048,2048,ck,0,0,21.1689,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,792.94,778.9,0.0 +gfx950,256,2016,2048,2048,ck,0,0,21.2743,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,794.92,779.37,0.0 +gfx950,256,2017,2048,128,cktile,6,0,7.3466,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,143.94,1195.38,0.0 +gfx950,256,2017,2048,512,ck,0,0,10.3246,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,409.7,1001.77,0.0 +gfx950,256,2017,2048,1024,ck,0,0,14.1869,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,596.32,875.75,0.0 +gfx950,256,2017,2048,2048,ck,2,0,22.7796,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,742.76,728.14,0.0 +gfx950,256,2017,2048,4096,ck,0,0,35.5629,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,951.54,700.5,0.0 +gfx950,256,2046,2048,2048,ck,0,0,21.2836,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,806.4,787.69,0.0 +gfx950,256,2048,2048,128,cktile,7,0,7.7402,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,138.72,1151.51,0.0 
+gfx950,256,2048,2048,512,ck,0,0,10.8642,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,395.33,965.17,0.0 +gfx950,256,2048,2048,1024,ck,0,0,14.0791,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,610.12,893.73,0.0 +gfx950,256,2048,2048,2048,ck,0,0,20.4767,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,839.0,819.33,0.0 +gfx950,256,2048,2048,4096,ck,0,0,34.6324,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,992.13,726.66,0.0 +gfx950,256,2050,2048,512,ck,18,0,11.8257,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,363.54,887.47,0.0 +gfx950,256,2050,2048,2048,ck,2,0,28.1498,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,610.9,596.43,0.0 +gfx950,256,2050,2048,4096,ck,2,0,48.0883,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,715.21,523.67,0.0 +gfx950,256,2063,2048,512,ck,18,0,11.9421,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,362.28,883.84,0.0 +gfx950,256,2063,2048,4096,ck,2,0,48.5755,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,712.53,520.61,0.0 +gfx950,256,2064,2048,512,ck,18,0,12.0187,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,360.15,878.59,0.0 +gfx950,256,2064,2048,4096,ck,2,0,48.4741,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,714.36,521.86,0.0 +gfx950,256,2081,2048,2048,ck,2,0,28.3279,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,616.24,599.41,0.0 
+gfx950,256,2095,2048,2048,ck,2,0,28.2054,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,623.08,605.06,0.0 +gfx950,256,2099,2048,512,ck,18,0,11.8188,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,372.45,907.09,0.0 +gfx950,256,2099,2048,4096,ck,2,0,48.504,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,726.03,527.45,0.0 +gfx950,256,2151,2048,128,cktile,6,0,8.0136,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,140.73,1166.51,0.0 +gfx950,256,2151,2048,1024,ck,2,0,18.6453,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,483.87,703.14,0.0 +gfx950,256,2154,2048,2048,ck,2,0,28.868,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,625.92,603.73,0.0 +gfx950,256,2159,2048,512,ck,18,0,13.0787,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,346.19,840.85,0.0 +gfx950,256,2159,2048,2048,ck,2,0,28.6449,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,632.26,609.5,0.0 +gfx950,256,2159,2048,4096,ck,0,0,49.7859,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,727.56,523.75,0.0 +gfx950,256,2160,2048,512,ck,18,0,12.9748,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,349.13,847.94,0.0 +gfx950,256,2160,2048,4096,ck,0,0,49.4639,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,732.63,527.32,0.0 +gfx950,256,2162,2048,2048,ck,2,0,28.8572,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,628.48,605.66,0.0 
+gfx950,256,2165,2048,2048,ck,2,0,28.6466,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,633.98,610.76,0.0 +gfx950,256,2251,2048,2048,ck,2,0,28.4053,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,664.76,634.55,0.0 +gfx950,256,2252,2048,2048,ck,2,0,28.4735,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,663.46,633.24,0.0 +gfx950,256,2255,2048,2048,ck,2,0,28.5118,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,663.46,633.04,0.0 +gfx950,256,2313,2048,512,ck,18,0,13.4024,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,361.93,873.49,0.0 +gfx950,256,2313,2048,4096,ck,2,0,49.4187,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,785.24,553.17,0.0 +gfx950,256,2317,2048,2048,ck,2,0,28.6792,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,677.72,642.62,0.0 +gfx950,256,2325,2048,512,ck,13,0,13.3168,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_2x1_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,366.14,883.26,0.0 +gfx950,256,2325,2048,4096,ck,2,0,49.1754,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,793.22,557.9,0.0 +gfx950,256,2332,2048,128,cktile,6,0,8.9983,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,135.87,1123.82,0.0 +gfx950,256,2332,2048,1024,ck,2,0,18.7106,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,522.76,750.22,0.0 +gfx950,256,2341,2048,128,cktile,6,0,8.7391,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,140.44,1161.51,0.0 
+gfx950,256,2341,2048,1024,ck,2,0,18.662,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,526.14,754.64,0.0 +gfx950,256,2694,2048,2048,ck,2,0,29.9208,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,755.29,693.37,0.0 +gfx950,256,2715,2048,2048,ck,2,0,29.4004,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.65,710.03,0.0 +gfx950,256,2819,2048,2048,ck,2,0,29.8864,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,791.25,719.87,0.0 +gfx950,256,2853,2048,2048,ck,2,0,29.996,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,797.86,724.2,0.0 +gfx950,256,2914,2048,2048,ck,2,0,30.0008,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,814.79,736.58,0.0 +gfx950,256,2974,2048,2048,ck,2,0,30.4312,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,819.81,738.27,0.0 +gfx950,256,3000,2048,2048,ck,2,0,30.7612,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,818.1,735.55,0.0 +gfx950,256,3001,2048,2048,ck,2,0,30.688,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,820.33,737.5,0.0 +gfx950,256,3072,2048,128,cktile,20,0,9.7999,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,164.35,1350.86,0.0 +gfx950,256,3072,2048,512,ck,18,0,13.8348,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,465.67,1098.99,0.0 +gfx950,256,3072,2048,1024,ck,2,0,20.586,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,625.91,865.92,0.0 
+gfx950,256,3072,2048,2048,ck,0,0,30.2228,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,852.66,763.29,0.0 +gfx950,256,3072,2048,4096,ck,0,0,50.8509,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1013.54,659.86,0.0 +gfx950,256,3073,2048,128,cktile,24,0,11.1396,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_1,144.63,1188.78,0.0 +gfx950,256,3073,2048,512,ck,18,0,15.1309,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,425.92,1005.16,0.0 +gfx950,256,3073,2048,1024,ck,0,0,21.2594,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,606.28,838.73,0.0 +gfx950,256,3073,2048,2048,ck,0,0,30.994,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,831.72,744.49,0.0 +gfx950,256,3073,2048,4096,ck,0,0,52.3318,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,985.18,641.34,0.0 +gfx950,256,3077,2048,2048,ck,0,0,31.1621,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,828.31,741.27,0.0 +gfx950,256,3078,2048,2048,ck,0,0,31.1404,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,829.15,741.98,0.0 +gfx950,256,3086,2048,2048,ck,0,0,31.4711,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,822.57,735.74,0.0 +gfx950,256,3089,2048,128,cktile,25,0,11.0796,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_3,146.17,1201.31,0.0 +gfx950,256,3089,2048,1024,ck,0,0,21.3608,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,606.54,838.58,0.0 
+gfx950,256,3089,2048,2048,ck,0,0,31.3852,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,825.63,738.35,0.0 +gfx950,256,3110,2048,128,ck,16,0,11.3975,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,143.06,1175.59,0.0 +gfx950,256,3110,2048,1024,ck,0,0,21.2176,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,614.79,849.31,0.0 +gfx950,256,3118,2048,2048,ck,0,0,30.8005,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,849.2,758.15,0.0 +gfx950,256,3137,2048,128,cktile,24,0,11.2193,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_1,146.59,1204.43,0.0 +gfx950,256,3137,2048,1024,ck,0,0,21.2686,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,618.64,853.77,0.0 +gfx950,256,3138,2048,2048,ck,0,0,31.2392,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,842.64,751.43,0.0 +gfx950,256,3154,2048,2048,ck,0,0,31.6203,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,836.73,745.49,0.0 +gfx950,256,3182,2048,512,ck,18,0,15.3991,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,433.35,1020.27,0.0 +gfx950,256,3182,2048,4096,ck,0,0,53.919,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,990.1,639.02,0.0 +gfx950,256,3183,2048,512,ck,18,0,15.3379,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,435.21,1024.64,0.0 +gfx950,256,3183,2048,4096,ck,0,0,53.421,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,999.64,645.13,0.0 
+gfx950,256,3289,2048,2048,ck,0,0,31.8535,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,866.16,766.07,0.0 +gfx950,256,3437,2048,2048,ck,0,0,32.1421,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,897.01,787.48,0.0 +gfx950,256,3613,2048,128,cktile,10,0,11.4416,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,165.56,1356.76,0.0 +gfx950,256,3613,2048,1024,ck,0,0,21.9031,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,691.87,940.31,0.0 +gfx950,256,3617,2048,2048,ck,0,0,32.7869,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,925.42,805.72,0.0 +gfx950,256,3620,2048,2048,ck,0,0,32.8325,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,924.9,805.17,0.0 +gfx950,256,3669,2048,2048,ck,0,0,32.7057,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,941.05,817.49,0.0 +gfx950,256,3824,2048,2048,ck,0,0,33.6353,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,953.7,823.21,0.0 +gfx950,256,3847,2048,512,ck,18,0,17.0623,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,472.84,1100.41,0.0 +gfx950,256,3847,2048,4096,ck,0,0,59.3735,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1087.05,672.07,0.0 +gfx950,256,3858,2048,2048,ck,0,0,34.2865,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,943.91,813.67,0.0 +gfx950,256,3906,2048,2048,ck,0,0,34.2813,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,955.8,822.39,0.0 
+gfx950,256,4000,2048,2048,ck,0,0,34.2409,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,979.95,840.23,0.0 +gfx950,256,4004,2048,2048,ck,0,0,34.6357,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,969.75,831.36,0.0 +gfx950,256,4096,2048,128,cktile,25,0,11.7529,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_3,182.72,1494.41,0.0 +gfx950,256,4096,2048,512,ck,18,0,17.0446,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,503.97,1168.87,0.0 +gfx950,256,4096,2048,1024,ck,0,0,22.5872,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,760.6,1021.32,0.0 +gfx950,256,4096,2048,4096,ck,0,0,58.1923,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1180.9,720.77,0.0 +gfx950,256,4097,2048,128,cktile,6,0,12.4091,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,173.1,1415.72,0.0 +gfx950,256,4097,2048,1024,cktile,30,0,27.7613,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,618.99,831.15,0.0 +gfx950,256,4100,2048,2048,cktile,32,0,40.2915,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,853.61,729.3,0.0 +gfx950,256,4108,2048,2048,cktile,12,0,40.3457,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,854.13,729.54,0.0 +gfx950,256,4109,2048,128,cktile,7,0,12.3771,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,174.06,1423.48,0.0 +gfx950,256,4109,2048,1024,cktile,31,0,27.8044,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,619.84,832.07,0.0 +gfx950,256,4111,2048,512,ck,18,0,18.3569,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,469.65,1089.08,0.0 
+gfx950,256,4111,2048,4096,cktile,29,0,62.8737,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1096.98,669.05,0.0 +gfx950,256,4121,2048,2048,cktile,12,0,41.1442,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,840.2,717.32,0.0 +gfx950,256,4123,2048,2048,cktile,32,0,40.5906,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,852.07,727.41,0.0 +gfx950,256,4141,2048,512,ck,18,0,18.4379,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,471.0,1091.79,0.0 +gfx950,256,4141,2048,4096,cktile,29,0,63.1994,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1099.29,669.49,0.0 +gfx950,256,4142,2048,512,ck,18,0,18.4741,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,470.19,1089.9,0.0 +gfx950,256,4142,2048,4096,cktile,27,0,63.0746,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1101.73,670.95,0.0 +gfx950,256,4143,2048,128,cktile,20,0,12.4497,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,174.47,1426.72,0.0 +gfx950,256,4143,2048,1024,cktile,30,0,27.8358,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,624.27,837.39,0.0 +gfx950,256,4149,2048,2048,cktile,31,0,40.8706,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,851.57,726.34,0.0 +gfx950,256,4204,2048,2048,cktile,30,0,40.5177,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,870.38,741.0,0.0 +gfx950,256,4223,2048,2048,cktile,31,0,40.9247,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,865.62,736.48,0.0 +gfx950,256,4229,2048,2048,cktile,12,0,40.7843,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,869.83,739.92,0.0 +gfx950,256,4345,2048,128,cktile,19,0,12.8748,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,176.94,1445.88,0.0 
+gfx950,256,4345,2048,512,ck,18,0,18.7036,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,487.19,1126.54,0.0 +gfx950,256,4345,2048,1024,ck,2,0,28.7828,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,633.16,845.77,0.0 +gfx950,256,4345,2048,4096,cktile,11,0,63.5458,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1147.16,692.14,0.0 +gfx950,256,4379,2048,2048,cktile,12,0,41.0321,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,895.24,757.92,0.0 +gfx950,256,4410,2048,2048,cktile,12,0,41.1746,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,898.46,759.92,0.0 +gfx950,256,4413,2048,128,cktile,7,0,12.6383,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,183.07,1495.66,0.0 +gfx950,256,4413,2048,1024,cktile,31,0,28.0829,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,659.1,879.24,0.0 +gfx950,256,4498,2048,2048,cktile,12,0,40.8558,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,923.54,779.08,0.0 +gfx950,256,4585,2048,2048,cktile,30,0,41.2823,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,931.68,783.98,0.0 +gfx950,256,4586,2048,2048,cktile,31,0,41.3619,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,930.09,782.62,0.0 +gfx950,256,4723,2048,2048,cktile,12,0,41.3863,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,957.31,802.5,0.0 +gfx950,256,4743,2048,2048,cktile,12,0,41.3044,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,963.27,807.06,0.0 +gfx950,256,4822,2048,2048,cktile,32,0,41.6608,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,970.93,811.81,0.0 +gfx950,256,4898,2048,2048,cktile,31,0,41.5369,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,989.18,825.47,0.0 
+gfx950,256,4936,2048,2048,cktile,12,0,41.429,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,999.45,833.26,0.0 +gfx950,256,4992,2048,2048,cktile,30,0,41.3855,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1011.85,842.45,0.0 +gfx950,256,5000,2048,2048,cktile,32,0,41.4236,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1012.54,842.86,0.0 +gfx950,256,5001,2048,2048,cktile,30,0,41.6517,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1007.2,838.39,0.0 +gfx950,256,5002,2048,2048,cktile,31,0,41.6954,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1006.34,837.66,0.0 +gfx950,256,5003,2048,2048,cktile,31,0,41.6567,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1007.48,838.59,0.0 +gfx950,256,5004,2048,2048,cktile,30,0,41.5666,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1009.86,840.55,0.0 +gfx950,256,5005,2048,2048,cktile,31,0,41.7385,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1005.91,837.24,0.0 +gfx950,256,5006,2048,2048,cktile,12,0,41.6095,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1009.23,839.98,0.0 +gfx950,256,5010,2048,2048,cktile,12,0,41.8076,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1005.25,836.59,0.0 +gfx950,256,5011,2048,2048,cktile,12,0,41.8406,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1004.65,836.08,0.0 +gfx950,256,5012,2048,2048,cktile,32,0,41.5702,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1011.39,841.66,0.0 +gfx950,256,5026,2048,2048,cktile,30,0,41.8113,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1008.37,838.87,0.0 +gfx950,256,5028,2048,2048,cktile,12,0,41.6495,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1012.69,842.42,0.0 
+gfx950,256,5030,2048,2048,cktile,32,0,41.8802,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1007.51,838.07,0.0 +gfx950,256,5031,2048,2048,cktile,12,0,41.7855,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1009.99,840.12,0.0 +gfx950,256,5060,2048,2048,cktile,31,0,41.5787,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1020.87,848.58,0.0 +gfx950,256,5062,2048,2048,cktile,31,0,41.9044,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1013.33,842.28,0.0 +gfx950,256,5063,2048,2048,cktile,32,0,41.6206,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1020.44,848.17,0.0 +gfx950,256,5091,2048,2048,cktile,30,0,41.608,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1026.4,852.56,0.0 +gfx950,256,5105,2048,512,ck,18,0,20.6958,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,517.3,1187.31,0.0 +gfx950,256,5105,2048,2048,cktile,32,0,41.7753,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1025.1,851.21,0.0 +gfx950,256,5105,2048,4096,cktile,27,0,65.0105,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1317.44,772.32,0.0 +gfx950,256,5120,2048,2048,cktile,30,0,41.7457,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1028.84,854.02,0.0 +gfx950,256,5121,2048,128,cktile,20,0,14.224,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,188.76,1539.18,0.0 +gfx950,256,5121,2048,512,ck,18,0,21.7571,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,493.61,1132.79,0.0 +gfx950,256,5121,2048,1024,cktile,31,0,28.4627,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,754.64,994.87,0.0 +gfx950,256,5121,2048,4096,cktile,29,0,65.012,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1321.54,774.32,0.0 
+gfx950,256,5122,2048,2048,cktile,31,0,41.9704,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1023.73,849.74,0.0 +gfx950,256,5123,2048,128,cktile,20,0,14.2715,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,188.2,1534.65,0.0 +gfx950,256,5123,2048,512,ck,18,0,21.6433,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,496.4,1139.17,0.0 +gfx950,256,5123,2048,1024,cktile,32,0,28.386,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,756.97,997.92,0.0 +gfx950,256,5123,2048,2048,cktile,12,0,41.8478,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1026.93,852.37,0.0 +gfx950,256,5123,2048,4096,cktile,28,0,65.0928,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1320.42,773.61,0.0 +gfx950,256,5124,2048,2048,cktile,32,0,41.5034,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1035.66,859.6,0.0 +gfx950,256,5127,2048,2048,cktile,31,0,41.6749,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1032.0,856.5,0.0 +gfx950,256,5131,2048,512,ck,18,0,21.5867,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,498.48,1143.86,0.0 +gfx950,256,5131,2048,4096,cktile,29,0,65.3686,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1316.9,771.35,0.0 +gfx950,256,5133,2048,128,cktile,7,0,14.3216,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,187.91,1532.23,0.0 +gfx950,256,5133,2048,1024,cktile,12,0,28.5403,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,754.35,994.32,0.0 +gfx950,256,5177,2048,2048,cktile,12,0,42.1067,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1031.38,855.01,0.0 +gfx950,256,5183,2048,2048,cktile,31,0,42.1044,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1032.63,855.94,0.0 
+gfx950,256,5185,2048,128,cktile,7,0,14.4713,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,187.85,1531.55,0.0 +gfx950,256,5185,2048,1024,cktile,32,0,28.5217,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,762.49,1004.3,0.0 +gfx950,256,5191,2048,128,cktile,20,0,14.3689,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,189.41,1544.23,0.0 +gfx950,256,5191,2048,1024,cktile,31,0,28.5546,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,762.49,1004.22,0.0 +gfx950,256,5252,2048,2048,cktile,12,0,41.8846,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1051.87,870.55,0.0 +gfx950,256,5255,2048,2048,cktile,31,0,42.3697,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1040.42,861.02,0.0 +gfx950,256,5361,2048,2048,cktile,12,0,42.2586,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1064.19,878.69,0.0 +gfx950,256,5427,2048,512,ck,18,0,21.6172,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,526.49,1205.35,0.0 +gfx950,256,5427,2048,2048,cktile,31,0,42.5345,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1070.31,882.53,0.0 +gfx950,256,5427,2048,4096,cktile,28,0,66.0641,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1378.21,799.93,0.0 +gfx950,256,5471,2048,2048,cktile,31,0,42.3016,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1084.93,893.78,0.0 +gfx950,256,5534,2048,2048,cktile,12,0,42.3209,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1096.92,902.51,0.0 +gfx950,256,5564,2048,2048,cktile,31,0,42.1818,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1106.5,909.86,0.0 +gfx950,256,5671,2048,2048,cktile,32,0,42.4883,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1119.64,918.77,0.0 
+gfx950,256,5927,2048,2048,cktile,31,0,42.9201,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1158.41,946.17,0.0 +gfx950,256,5954,2048,2048,cktile,31,0,42.9165,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1163.79,950.12,0.0 +gfx950,256,6000,2048,2048,cktile,12,0,42.8017,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1175.93,959.27,0.0 +gfx950,256,6002,2048,2048,cktile,30,0,43.2385,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1164.44,949.86,0.0 +gfx950,256,6113,2048,512,cktile,30,0,22.3656,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,573.2,1306.35,0.0 +gfx950,256,6113,2048,4096,cktile,29,0,67.9741,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1508.8,860.13,0.0 +gfx950,256,6146,2048,128,cktile,6,0,16.2615,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,198.15,1612.57,0.0 +gfx950,256,6146,2048,1024,ck,0,0,37.9007,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,680.15,885.6,0.0 +gfx950,256,6146,2048,2048,ck,0,0,58.3038,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,884.27,719.6,0.0 +gfx950,256,6147,2048,128,cktile,20,0,16.2358,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,198.5,1615.39,0.0 +gfx950,256,6147,2048,1024,ck,0,0,37.7839,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,682.36,888.47,0.0 +gfx950,256,6155,2048,128,cktile,19,0,16.2266,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,198.87,1618.38,0.0 +gfx950,256,6155,2048,1024,ck,0,0,37.8511,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,682.04,887.97,0.0 
+gfx950,256,6194,2048,2048,ck,0,0,58.327,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,890.82,724.37,0.0 +gfx950,256,6197,2048,512,ck,18,0,24.7318,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,525.48,1197.02,0.0 +gfx950,256,6197,2048,4096,ck,0,0,99.9766,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1039.93,591.68,0.0 +gfx950,256,6227,2048,2048,ck,0,0,58.0378,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,900.03,731.47,0.0 +gfx950,256,6229,2048,2048,ck,0,0,57.9478,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,901.72,732.82,0.0 +gfx950,256,6234,2048,2048,ck,0,0,57.4538,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,910.2,739.66,0.0 +gfx950,256,6246,2048,2048,ck,0,0,58.2189,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,899.97,731.2,0.0 +gfx950,256,6304,2048,2048,ck,0,0,58.3983,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,905.54,735.06,0.0 +gfx950,256,6314,2048,2048,ck,0,0,58.6431,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,903.19,733.04,0.0 +gfx950,256,6318,2048,2048,ck,0,0,57.9474,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,914.61,742.26,0.0 +gfx950,256,6341,2048,2048,ck,0,0,58.4499,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,910.05,738.3,0.0 +gfx950,256,6377,2048,128,cktile,20,0,16.4216,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,203.6,1656.27,0.0 
+gfx950,256,6377,2048,1024,ck,0,0,38.2626,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,699.04,908.13,0.0 +gfx950,256,6377,2048,2048,ck,0,0,58.4111,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,915.82,742.57,0.0 +gfx950,256,6401,2048,128,cktile,6,0,16.4272,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,204.29,1661.88,0.0 +gfx950,256,6401,2048,1024,ck,0,0,38.0151,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,706.24,917.27,0.0 +gfx950,256,6409,2048,512,ck,18,0,25.3228,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,530.77,1207.66,0.0 +gfx950,256,6409,2048,4096,ck,0,0,101.1523,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1063.0,601.97,0.0 +gfx950,256,6476,2048,2048,ck,0,0,58.5015,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,928.6,751.82,0.0 +gfx950,256,6483,2048,2048,ck,0,0,58.0731,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,936.46,758.11,0.0 +gfx950,256,6492,2048,2048,ck,0,0,58.1539,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,936.46,758.01,0.0 +gfx950,256,6519,2048,2048,ck,0,0,57.9027,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,944.43,764.16,0.0 +gfx950,256,6609,2048,2048,ck,0,0,58.5668,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,946.62,764.94,0.0 +gfx950,256,6811,2048,2048,ck,0,0,59.3478,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,962.71,775.78,0.0 
+gfx950,256,6813,2048,2048,ck,0,0,58.7865,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,972.19,783.4,0.0 +gfx950,256,6818,2048,2048,ck,0,0,60.0023,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,953.19,768.04,0.0 +gfx950,256,6830,2048,2048,ck,0,0,59.3815,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,964.85,777.31,0.0 +gfx950,256,7001,2048,2048,ck,0,0,59.8127,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,981.88,789.27,0.0 +gfx950,256,7025,2048,2048,ck,0,0,59.4919,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,990.55,796.01,0.0 +gfx950,256,7121,2048,128,cktile,7,0,18.0812,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,206.48,1678.05,0.0 +gfx950,256,7121,2048,512,ck,18,0,26.7927,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,557.38,1263.86,0.0 +gfx950,256,7121,2048,1024,ck,0,0,39.3557,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,758.92,979.7,0.0 +gfx950,256,7121,2048,2048,ck,0,0,60.2027,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,992.24,796.4,0.0 +gfx950,256,7121,2048,4096,ck,0,0,104.6095,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1142.06,637.84,0.0 +gfx950,256,7168,2048,512,ck,18,0,26.7376,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,562.22,1274.56,0.0 +gfx950,256,7168,2048,4096,ck,0,0,102.2687,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1175.91,656.2,0.0 
+gfx950,256,7169,2048,128,cktile,6,0,18.2248,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,206.24,1675.96,0.0 +gfx950,256,7169,2048,512,ck,18,0,28.1128,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,534.79,1212.38,0.0 +gfx950,256,7169,2048,1024,ck,0,0,39.2656,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,765.78,988.2,0.0 +gfx950,256,7169,2048,2048,ck,0,0,60.3125,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,997.11,799.84,0.0 +gfx950,256,7169,2048,4096,ck,0,0,105.5216,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1139.82,636.05,0.0 +gfx950,256,7170,2048,128,cktile,19,0,18.1554,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,207.05,1682.6,0.0 +gfx950,256,7170,2048,1024,ck,0,0,39.4278,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,762.74,984.27,0.0 +gfx950,256,7172,2048,512,ck,18,0,28.072,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,535.79,1214.63,0.0 +gfx950,256,7172,2048,4096,ck,0,0,104.2444,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1154.27,644.08,0.0 +gfx950,256,7176,2048,128,cktile,19,0,18.1987,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,206.73,1679.99,0.0 +gfx950,256,7176,2048,512,ck,18,0,27.9024,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,539.35,1222.68,0.0 +gfx950,256,7176,2048,1024,ck,0,0,39.1867,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,768.08,991.11,0.0 
+gfx950,256,7176,2048,4096,ck,0,0,105.6936,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1139.08,635.56,0.0 +gfx950,256,7177,2048,128,cktile,20,0,18.1774,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,207.01,1682.19,0.0 +gfx950,256,7177,2048,512,ck,0,0,28.1975,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,533.78,1210.04,0.0 +gfx950,256,7177,2048,1024,ck,0,0,38.9082,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,773.68,998.33,0.0 +gfx950,256,7177,2048,2048,ck,0,0,60.8254,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,989.8,793.91,0.0 +gfx950,256,7177,2048,4096,ck,0,0,105.9604,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1136.37,634.03,0.0 +gfx950,256,7178,2048,512,ck,18,0,27.8668,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,540.19,1224.57,0.0 +gfx950,256,7178,2048,4096,ck,0,0,105.9377,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1136.77,634.25,0.0 +gfx950,256,7179,2048,512,ck,18,0,28.004,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,537.62,1218.73,0.0 +gfx950,256,7179,2048,4096,ck,0,0,104.8177,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1149.08,641.1,0.0 +gfx950,256,7183,2048,512,ck,0,0,27.8558,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,540.78,1225.88,0.0 +gfx950,256,7183,2048,4096,ck,0,0,105.5697,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1141.53,636.85,0.0 
+gfx950,256,7184,2048,512,ck,18,0,27.9688,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,538.67,1221.09,0.0 +gfx950,256,7184,2048,4096,ck,0,0,104.8921,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1149.06,641.04,0.0 +gfx950,256,7185,2048,128,cktile,7,0,18.2539,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,206.37,1676.99,0.0 +gfx950,256,7185,2048,1024,ck,0,0,39.1134,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,770.48,994.14,0.0 +gfx950,256,7206,2048,128,cktile,7,0,18.2282,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,207.26,1684.22,0.0 +gfx950,256,7206,2048,1024,ck,0,0,39.4453,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,766.23,988.5,0.0 +gfx950,256,7209,2048,512,ck,18,0,27.7406,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,544.99,1235.29,0.0 +gfx950,256,7209,2048,4096,ck,0,0,105.7793,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1143.39,637.6,0.0 +gfx950,256,7210,2048,512,ck,18,0,28.1672,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,536.81,1216.74,0.0 +gfx950,256,7210,2048,4096,ck,0,0,105.4765,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1146.83,639.51,0.0 +gfx950,256,7211,2048,512,ck,18,0,28.0061,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,539.97,1223.91,0.0 +gfx950,256,7211,2048,4096,ck,0,0,105.6689,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1144.9,638.42,0.0 
+gfx950,256,7217,2048,128,cktile,7,0,18.2628,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,207.19,1683.57,0.0 +gfx950,256,7217,2048,512,ck,0,0,28.1953,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,536.8,1216.67,0.0 +gfx950,256,7217,2048,1024,ck,0,0,39.0005,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,776.15,1001.22,0.0 +gfx950,256,7217,2048,2048,ck,0,0,61.2926,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,987.73,791.87,0.0 +gfx950,256,7217,2048,4096,ck,0,0,104.3465,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1160.38,646.98,0.0 +gfx950,256,7223,2048,2048,ck,0,0,60.8107,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,996.39,798.75,0.0 +gfx950,256,7238,2048,2048,ck,0,0,60.4754,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1003.99,804.7,0.0 +gfx950,256,7246,2048,2048,ck,0,0,60.2239,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1009.3,808.88,0.0 +gfx950,256,7249,2048,2048,ck,0,0,60.7059,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1001.7,802.76,0.0 +gfx950,256,7257,2048,128,cktile,7,0,18.2852,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_2,208.08,1690.75,0.0 +gfx950,256,7257,2048,1024,ck,0,0,39.2481,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,775.53,1000.12,0.0 +gfx950,256,7265,2048,128,cktile,6,0,18.2351,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_1,208.88,1697.25,0.0 +gfx950,256,7265,2048,1024,ck,0,0,39.0553,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,780.22,1006.11,0.0 
+gfx950,256,7273,2048,512,ck,18,0,27.9923,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,544.89,1234.72,0.0 +gfx950,256,7273,2048,4096,ck,0,0,107.5254,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1134.81,632.12,0.0 +gfx950,256,7274,2048,512,ck,18,0,28.1412,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,542.08,1228.35,0.0 +gfx950,256,7274,2048,4096,ck,0,0,104.947,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1162.85,647.73,0.0 +gfx950,256,7316,2048,2048,ck,0,0,60.3891,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1016.26,813.79,0.0 +gfx950,256,7391,2048,512,ck,0,0,28.3506,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,546.73,1238.29,0.0 +gfx950,256,7391,2048,4096,ck,0,0,106.275,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1166.79,648.65,0.0 +gfx950,256,7393,2048,128,cktile,20,0,18.3695,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_4,211.01,1714.26,0.0 +gfx950,256,7393,2048,512,ck,18,0,28.6184,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,541.76,1227.03,0.0 +gfx950,256,7393,2048,1024,ck,0,0,39.5241,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,784.55,1010.76,0.0 +gfx950,256,7393,2048,2048,ck,0,0,61.4671,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1008.95,807.21,0.0 +gfx950,256,7393,2048,4096,ck,0,0,104.2778,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1189.46,661.23,0.0 
+gfx950,256,7461,2048,128,cktile,19,0,18.4218,a8w8_blockscale_cktile_32x128x128_1x4x1_16x16x64_intrawave_0x1x0_3,212.34,1724.99,0.0 +gfx950,256,7461,2048,1024,ck,0,0,39.6618,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,789.01,1016.03,0.0 +gfx950,256,7491,2048,2048,ck,0,0,60.7595,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1034.23,826.52,0.0 +gfx950,256,7558,2048,2048,ck,0,0,62.0659,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1021.51,815.76,0.0 +gfx950,256,7634,2048,128,cktile,10,0,18.4442,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_2,217.0,1762.51,0.0 +gfx950,256,7634,2048,1024,ck,0,0,39.4317,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,812.02,1044.42,0.0 +gfx950,256,7650,2048,2048,ck,0,0,61.1251,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1049.86,837.56,0.0 +gfx950,256,7715,2048,512,ck,0,0,28.6077,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,565.57,1279.35,0.0 +gfx950,256,7715,2048,4096,ck,0,0,108.6967,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1190.8,658.62,0.0 +gfx950,256,7753,2048,512,ck,0,0,28.6716,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,567.08,1282.61,0.0 +gfx950,256,7753,2048,2048,ck,0,0,62.4976,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1040.63,829.29,0.0 +gfx950,256,7753,2048,4096,ck,0,0,109.1364,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1191.85,658.82,0.0 
+gfx950,256,7774,2048,2048,ck,0,0,61.9552,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1052.58,838.63,0.0 +gfx950,256,7815,2048,2048,ck,0,0,61.7436,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1061.76,845.59,0.0 +gfx950,256,7819,2048,512,ck,0,0,28.5876,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,573.59,1297.01,0.0 +gfx950,256,7819,2048,4096,ck,0,0,108.7384,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1206.39,666.2,0.0 +gfx950,256,7834,2048,2048,ck,0,0,62.4948,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1051.55,837.29,0.0 +gfx950,256,7836,2048,2048,ck,0,0,61.7956,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1063.72,846.96,0.0 +gfx950,256,7871,2048,2048,ck,0,0,61.688,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1070.33,851.93,0.0 +gfx950,256,7899,2048,2048,ck,0,0,62.48,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1060.53,843.88,0.0 +gfx950,256,7913,2048,2048,ck,0,0,61.7108,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1075.65,855.79,0.0 +gfx950,256,7930,2048,2048,ck,0,0,62.4132,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1065.83,847.84,0.0 +gfx950,256,7961,2048,2048,ck,0,0,63.2516,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1055.81,839.61,0.0 +gfx950,256,7963,2048,2048,ck,0,0,63.3724,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1054.06,838.2,0.0 
+gfx950,256,7974,2048,2048,ck,0,0,63.9584,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1045.85,831.58,0.0 +gfx950,256,8016,2048,2048,ck,0,0,63.1496,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1064.82,846.32,0.0 +gfx950,256,8017,2048,2048,ck,0,0,63.5772,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1057.79,840.72,0.0 +gfx950,256,8099,2048,512,ck,0,0,29.6522,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,572.8,1293.96,0.0 +gfx950,256,8099,2048,4096,ck,0,0,110.7685,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1226.69,674.7,0.0 +gfx950,256,8103,2048,2048,ck,0,0,64.9912,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1045.88,830.56,0.0 +gfx950,256,8128,2048,2048,ck,0,0,63.7684,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1069.22,848.9,0.0 +gfx950,256,8129,2048,128,cktile,26,0,18.6432,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_4,228.61,1855.85,0.0 +gfx950,256,8129,2048,512,ck,0,0,29.7984,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,572.1,1292.25,0.0 +gfx950,256,8129,2048,1024,ck,0,0,41.547,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,820.65,1052.25,0.0 +gfx950,256,8129,2048,2048,ck,0,0,64.2288,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1061.69,842.91,0.0 +gfx950,256,8129,2048,4096,ck,0,0,111.2916,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1225.45,673.74,0.0 
+gfx950,256,8150,2048,2048,ck,0,0,64.47,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1060.45,841.75,0.0 +gfx950,256,8184,2048,2048,ck,0,0,63.8003,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1076.05,853.86,0.0 +gfx950,256,8185,2048,2048,ck,0,0,64.74,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1060.56,841.57,0.0 +gfx950,256,8188,2048,512,ck,0,0,29.7506,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,577.18,1303.47,0.0 +gfx950,256,8188,2048,4096,ck,0,0,111.9031,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1227.6,674.38,0.0 +gfx950,256,8192,2048,128,cktile,25,0,18.6204,a8w8_blockscale_cktile_128x128x128_2x2x1_16x16x128_intrawave_0x1x0_3,230.66,1872.42,0.0 +gfx950,256,8192,2048,512,ck,0,0,29.4494,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,583.37,1317.42,0.0 +gfx950,256,8192,2048,1024,ck,0,0,40.6381,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,845.51,1083.72,0.0 +gfx950,256,8192,2048,2048,ck,0,0,63.3251,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1085.19,861.05,0.0 +gfx950,256,8192,2048,4096,ck,0,0,108.3607,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1268.35,696.72,0.0 +gfx950,256,1,2560,2048,ck,8,2,6.2595,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.68,838.73,0.0 +gfx950,256,2,2560,2048,ck,8,3,6.2748,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.34,837.83,0.0 
+gfx950,256,4,2560,2048,ck,8,3,6.6483,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.31,792.92,0.0 +gfx950,256,8,2560,2048,ck,8,2,6.8899,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.18,769.27,0.0 +gfx950,256,16,2560,2048,ck,8,2,7.0825,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,23.69,756.45,0.0 +gfx950,256,32,2560,2048,ck,8,3,7.7553,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,43.27,705.61,0.0 +gfx950,256,48,2560,2048,ck,8,2,8.0392,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,62.61,694.96,0.0 +gfx950,256,64,2560,2048,ck,8,0,8.5954,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.08,663.34,0.0 +gfx950,256,80,2560,2048,ck,8,0,8.5765,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.81,678.17,0.0 +gfx950,256,96,2560,2048,ck,8,0,8.2734,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,121.67,716.88,0.0 +gfx950,256,112,2560,2048,ck,7,0,9.9327,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,118.24,608.67,0.0 +gfx950,256,128,2560,2048,ck,7,0,9.919,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,135.31,621.07,0.0 +gfx950,256,256,2560,2048,ck,18,0,9.8047,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,273.78,721.89,0.0 +gfx950,256,512,2560,2048,ck,2,0,14.9962,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,358.0,594.34,0.0 
+gfx950,256,588,2560,2048,ck,2,0,15.2743,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,403.66,619.19,0.0 +gfx950,256,773,2560,2048,ck,3,0,18.6898,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,433.69,576.99,0.0 +gfx950,256,822,2560,2048,ck,3,0,18.6571,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,461.98,596.82,0.0 +gfx950,256,1024,2560,2048,ck,0,0,19.2318,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,558.32,654.28,0.0 +gfx950,256,1025,2560,2048,ck,0,0,20.1839,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,532.5,623.77,0.0 +gfx950,256,1027,2560,2048,ck,0,0,20.0204,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,537.9,629.58,0.0 +gfx950,256,1042,2560,2048,ck,0,0,20.1333,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,542.69,631.39,0.0 +gfx950,256,1051,2560,2048,ck,0,0,20.0161,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,550.58,638.31,0.0 +gfx950,256,1055,2560,2048,ck,0,0,20.1406,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,549.26,635.79,0.0 +gfx950,256,1057,2560,2048,ck,0,0,20.1916,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,548.91,634.89,0.0 +gfx950,256,1069,2560,2048,ck,0,0,20.2273,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,554.17,638.02,0.0 +gfx950,256,1072,2560,2048,ck,0,0,20.1992,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,556.49,639.97,0.0 
+gfx950,256,1074,2560,2048,ck,0,0,20.2086,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,557.27,640.39,0.0 +gfx950,256,1091,2560,2048,ck,0,0,20.1834,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,566.8,647.22,0.0 +gfx950,256,1128,2560,2048,ck,2,0,20.3205,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,582.07,655.91,0.0 +gfx950,256,1136,2560,2048,ck,3,0,20.36,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,585.06,657.45,0.0 +gfx950,256,1514,2560,2048,ck,0,0,21.0928,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,752.65,763.07,0.0 +gfx950,256,2017,2560,2048,ck,2,0,30.0164,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,704.61,656.33,0.0 +gfx950,256,2048,2560,2048,ck,2,0,29.6162,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,725.1,672.7,0.0 +gfx950,256,2151,2560,2048,ck,2,0,29.944,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,753.24,690.0,0.0 +gfx950,256,2332,2560,2048,ck,2,0,30.174,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,810.39,727.73,0.0 +gfx950,256,2341,2560,2048,ck,2,0,30.5484,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,803.55,720.93,0.0 +gfx950,256,3072,2560,2048,ck,0,0,32.915,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,978.65,828.28,0.0 +gfx950,256,3073,2560,2048,ck,0,0,34.9774,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,921.24,779.65,0.0 
+gfx950,256,3089,2560,2048,ck,0,0,33.4773,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,967.54,818.01,0.0 +gfx950,256,3110,2560,2048,ck,0,0,34.3253,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,950.05,802.19,0.0 +gfx950,256,3137,2560,2048,ck,0,0,33.8313,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,972.29,819.62,0.0 +gfx950,256,3613,2560,2048,cktile,32,0,40.624,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,932.58,766.56,0.0 +gfx950,256,4096,2560,2048,cktile,30,0,41.2557,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1041.06,838.74,0.0 +gfx950,256,4097,2560,2048,cktile,30,0,41.3552,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1038.81,836.9,0.0 +gfx950,256,4109,2560,2048,cktile,31,0,41.2896,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1043.51,840.31,0.0 +gfx950,256,4143,2560,2048,cktile,12,0,41.389,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1049.61,844.18,0.0 +gfx950,256,4345,2560,2048,cktile,32,0,41.6851,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1092.97,872.92,0.0 +gfx950,256,4413,2560,2048,cktile,32,0,41.812,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1106.71,881.93,0.0 +gfx950,256,5121,2560,2048,ck,0,0,58.3907,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,919.63,718.44,0.0 +gfx950,256,5123,2560,2048,ck,0,0,58.0935,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,924.69,722.36,0.0 +gfx950,256,5133,2560,2048,ck,0,0,58.2587,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,923.87,721.54,0.0 
+gfx950,256,5185,2560,2048,ck,0,0,58.7822,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,924.92,721.46,0.0 +gfx950,256,5191,2560,2048,ck,0,0,59.1186,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,920.72,718.08,0.0 +gfx950,256,6146,2560,2048,ck,0,0,62.689,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1028.02,786.38,0.0 +gfx950,256,6147,2560,2048,ck,0,0,62.801,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1026.35,785.09,0.0 +gfx950,256,6155,2560,2048,ck,0,0,62.6518,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1030.14,787.88,0.0 +gfx950,256,6377,2560,2048,ck,0,0,62.432,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1071.05,816.14,0.0 +gfx950,256,6401,2560,2048,ck,0,0,63.784,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1052.29,801.54,0.0 +gfx950,256,7121,2560,2048,ck,0,0,72.64,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1027.93,774.87,0.0 +gfx950,256,7169,2560,2048,ck,0,0,72.847,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1031.92,777.39,0.0 +gfx950,256,7170,2560,2048,ck,0,0,73.3338,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1025.21,772.32,0.0 +gfx950,256,7176,2560,2048,ck,0,0,73.699,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1020.99,769.08,0.0 +gfx950,256,7177,2560,2048,ck,0,0,73.1647,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1028.59,774.79,0.0 
+gfx950,256,7185,2560,2048,ck,0,0,73.2875,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1028.01,774.28,0.0 +gfx950,256,7206,2560,2048,ck,0,0,74.4999,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1014.23,763.7,0.0 +gfx950,256,7217,2560,2048,ck,0,0,73.4183,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1030.75,776.02,0.0 +gfx950,256,7257,2560,2048,ck,0,0,74.6868,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1018.86,766.68,0.0 +gfx950,256,7265,2560,2048,ck,0,0,73.5272,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1036.07,779.55,0.0 +gfx950,256,7393,2560,2048,ck,0,0,74.1444,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1045.54,785.44,0.0 +gfx950,256,7461,2560,2048,ck,0,0,74.2744,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1053.31,790.63,0.0 +gfx950,256,7634,2560,2048,ck,0,0,75.2188,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1064.21,797.19,0.0 +gfx950,256,8129,2560,2048,cktile,27,0,78.0657,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1091.88,813.57,0.0 +gfx950,256,8192,2560,2048,ck,0,0,77.0048,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.51,830.64,0.0 +gfx950,256,1,3072,2048,ck,8,2,6.3471,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.98,992.52,0.0 +gfx950,256,2,3072,2048,ck,8,2,6.3948,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.94,986.4,0.0 
+gfx950,256,4,3072,2048,ck,8,2,6.9834,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.21,905.61,0.0 +gfx950,256,8,3072,2048,ck,8,2,6.9724,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,14.44,911.74,0.0 +gfx950,256,16,3072,2048,ck,8,2,7.184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.02,894.0,0.0 +gfx950,256,32,3072,2048,ck,8,2,8.1303,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,49.53,806.07,0.0 +gfx950,256,48,3072,2048,ck,8,0,8.692,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,69.49,769.06,0.0 +gfx950,256,64,3072,2048,ck,8,0,8.6738,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,92.84,785.79,0.0 +gfx950,256,80,3072,2048,ck,8,0,8.7579,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.94,793.21,0.0 +gfx950,256,96,3072,2048,ck,7,0,9.6452,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,125.24,733.82,0.0 +gfx950,256,112,3072,2048,ck,7,0,10.1215,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,139.24,712.24,0.0 +gfx950,256,128,3072,2048,ck,7,0,10.082,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,159.75,728.03,0.0 +gfx950,256,256,3072,2048,ck,18,0,9.9081,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,325.11,846.64,0.0 +gfx950,256,512,3072,2048,ck,2,0,15.1008,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,426.63,694.38,0.0 
+gfx950,256,588,3072,2048,ck,2,0,15.6185,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,473.72,711.23,0.0 +gfx950,256,773,3072,2048,ck,0,0,19.7657,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,492.09,638.68,0.0 +gfx950,256,822,3072,2048,ck,0,0,19.8751,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,520.41,655.36,0.0 +gfx950,256,1024,3072,2048,ck,0,0,19.6821,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,654.65,745.86,0.0 +gfx950,256,1025,3072,2048,ck,0,0,20.4709,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,630.04,717.52,0.0 +gfx950,256,1027,3072,2048,ck,0,0,20.4713,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,631.26,718.31,0.0 +gfx950,256,1042,3072,2048,ck,0,0,20.5331,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,638.55,722.13,0.0 +gfx950,256,1051,3072,2048,ck,0,0,20.4252,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,647.47,729.55,0.0 +gfx950,256,1055,3072,2048,ck,0,0,20.454,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,649.02,730.13,0.0 +gfx950,256,1057,3072,2048,ck,0,0,20.4731,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,649.64,730.25,0.0 +gfx950,256,1069,3072,2048,ck,0,0,21.1471,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,636.07,711.62,0.0 +gfx950,256,1072,3072,2048,ck,0,0,20.9624,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,643.48,719.06,0.0 
+gfx950,256,1074,3072,2048,ck,0,0,20.6796,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,653.5,729.69,0.0 +gfx950,256,1091,3072,2048,ck,0,0,20.6417,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,665.06,737.77,0.0 +gfx950,256,1128,3072,2048,ck,0,0,20.8009,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,682.35,746.7,0.0 +gfx950,256,1136,3072,2048,ck,0,0,20.7475,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,688.96,751.78,0.0 +gfx950,256,1514,3072,2048,ck,2,0,28.6114,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,665.84,653.38,0.0 +gfx950,256,2017,3072,2048,ck,2,0,30.7636,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,824.99,741.61,0.0 +gfx950,256,2048,3072,2048,ck,2,0,30.8132,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,836.32,748.66,0.0 +gfx950,256,2151,3072,2048,ck,0,0,31.9397,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,847.4,748.67,0.0 +gfx950,256,2332,3072,2048,cktile,31,0,39.2603,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,747.41,646.84,0.0 +gfx950,256,2341,3072,2048,ck,0,0,32.8012,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,898.03,776.46,0.0 +gfx950,256,3072,3072,2048,cktile,31,0,40.5402,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,953.49,775.95,0.0 +gfx950,256,3073,3072,2048,cktile,30,0,40.5715,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,953.07,775.56,0.0 +gfx950,256,3089,3072,2048,cktile,12,0,40.5284,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,959.05,779.61,0.0 
+gfx950,256,3110,3072,2048,cktile,12,0,40.3175,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,970.62,787.96,0.0 +gfx950,256,3137,3072,2048,cktile,32,0,40.4759,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,975.21,790.34,0.0 +gfx950,256,3613,3072,2048,cktile,32,0,41.4789,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1096.03,865.24,0.0 +gfx950,256,4096,3072,2048,ck,0,0,49.9202,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1032.44,798.19,0.0 +gfx950,256,4097,3072,2048,ck,0,0,57.9139,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,890.15,688.16,0.0 +gfx950,256,4109,3072,2048,ck,0,0,58.2443,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,887.7,685.94,0.0 +gfx950,256,4143,3072,2048,ck,0,0,58.1091,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,897.12,692.33,0.0 +gfx950,256,4345,3072,2048,ck,0,0,58.2375,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,938.79,719.22,0.0 +gfx950,256,4413,3072,2048,ck,0,0,58.9427,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,942.07,720.07,0.0 +gfx950,256,5121,3072,2048,ck,0,0,62.3415,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1033.61,773.85,0.0 +gfx950,256,5123,3072,2048,ck,0,0,62.5323,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1030.86,771.75,0.0 +gfx950,256,5133,3072,2048,ck,0,0,61.7291,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1046.32,783.12,0.0 
+gfx950,256,5185,3072,2048,ck,0,0,62.049,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1051.47,785.94,0.0 +gfx950,256,5191,3072,2048,ck,0,0,62.1118,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1051.62,785.94,0.0 +gfx950,256,6146,3072,2048,ck,0,0,74.8339,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1033.42,756.87,0.0 +gfx950,256,6147,3072,2048,ck,0,0,73.5639,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1051.43,770.05,0.0 +gfx950,256,6155,3072,2048,ck,0,0,74.7039,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1036.73,759.17,0.0 +gfx950,256,6377,3072,2048,ck,0,0,74.9421,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1070.71,781.03,0.0 +gfx950,256,6401,3072,2048,ck,0,0,74.9441,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1074.71,783.63,0.0 +gfx950,256,7121,3072,2048,cktile,27,0,78.6607,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1139.11,821.59,0.0 +gfx950,256,7169,3072,2048,cktile,11,0,78.9205,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1143.01,823.87,0.0 +gfx950,256,7170,3072,2048,cktile,29,0,78.5676,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1148.3,827.67,0.0 +gfx950,256,7176,3072,2048,cktile,28,0,79.1621,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1140.63,822.08,0.0 +gfx950,256,7177,3072,2048,cktile,27,0,78.834,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1145.54,825.6,0.0 +gfx950,256,7185,3072,2048,cktile,11,0,78.4149,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1152.95,830.85,0.0 
+gfx950,256,7206,3072,2048,cktile,29,0,78.6531,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1152.81,830.52,0.0 +gfx950,256,7217,3072,2048,cktile,27,0,78.5173,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1156.57,833.1,0.0 +gfx950,256,7257,3072,2048,cktile,11,0,78.9258,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1156.96,832.94,0.0 +gfx950,256,7265,3072,2048,cktile,11,0,78.9425,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1157.99,833.6,0.0 +gfx950,256,7393,3072,2048,cktile,11,0,79.0694,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1176.5,845.52,0.0 +gfx950,256,7461,3072,2048,cktile,11,0,78.7755,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1191.76,855.75,0.0 +gfx950,256,7634,3072,2048,cktile,28,0,79.33,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1210.87,867.63,0.0 +gfx950,256,8129,3072,2048,ck,0,0,91.3298,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1119.97,798.03,0.0 +gfx950,256,8192,3072,2048,ck,0,0,89.6202,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1150.18,819.02,0.0 +gfx950,256,1,4608,2048,ck,8,3,7.0728,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.67,1335.89,0.0 +gfx950,256,2,4608,2048,ck,8,3,7.6647,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.93,1234.19,0.0 +gfx950,256,4,4608,2048,ck,8,3,7.67,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.84,1236.28,0.0 +gfx950,256,8,4608,2048,ck,8,3,7.9937,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.89,1191.85,0.0 
+gfx950,256,16,4608,2048,ck,8,3,8.1216,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,37.18,1184.18,0.0 +gfx950,256,24,4608,2048,ck,8,2,8.9204,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,50.78,1088.24,0.0 +gfx950,256,32,4608,2048,ck,8,0,8.8527,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,68.23,1106.74,0.0 +gfx950,256,40,4608,2048,ck,8,0,9.0193,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,83.71,1096.29,0.0 +gfx950,256,48,4608,2048,ck,8,0,8.8855,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,101.96,1122.94,0.0 +gfx950,256,56,4608,2048,ck,7,0,10.3639,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,101.99,971.45,0.0 +gfx950,256,64,4608,2048,ck,7,0,10.0451,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,120.25,1011.25,0.0 +gfx950,256,72,4608,2048,ck,7,0,10.1234,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,134.24,1012.33,0.0 +gfx950,256,80,4608,2048,ck,7,0,10.2445,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,147.39,1009.16,0.0 +gfx950,256,88,4608,2048,ck,7,0,9.9238,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,167.37,1050.85,0.0 +gfx950,256,96,4608,2048,ck,7,0,9.8503,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,183.95,1067.84,0.0 +gfx950,256,104,4608,2048,ck,7,0,10.4673,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,187.53,1013.5,0.0 
+gfx950,256,112,4608,2048,ck,7,0,10.3101,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,205.03,1037.7,0.0 +gfx950,256,120,4608,2048,ck,18,0,11.7484,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,192.79,918.33,0.0 +gfx950,256,128,4608,2048,ck,18,0,10.6713,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,226.39,1019.46,0.0 +gfx950,256,136,4608,2048,ck,18,0,11.8387,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,216.82,926.54,0.0 +gfx950,256,144,4608,2048,ck,18,0,11.8608,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,229.15,932.42,0.0 +gfx950,256,152,4608,2048,ck,18,0,12.0878,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,237.34,922.36,0.0 +gfx950,256,160,4608,2048,ck,12,0,11.9427,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,252.87,941.11,0.0 +gfx950,256,168,4608,2048,ck,18,0,11.9934,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,264.39,944.65,0.0 +gfx950,256,176,4608,2048,ck,18,0,12.1793,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,272.75,937.63,0.0 +gfx950,256,184,4608,2048,ck,18,0,12.2004,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,284.65,943.39,0.0 +gfx950,256,192,4608,2048,ck,18,0,11.0097,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,329.15,1053.6,0.0 +gfx950,256,200,4608,2048,ck,12,0,13.0279,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,289.75,897.3,0.0 
+gfx950,256,208,4608,2048,ck,12,0,13.3715,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,293.6,880.99,0.0 +gfx950,256,216,4608,2048,ck,12,0,13.3337,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,305.76,890.24,0.0 +gfx950,256,224,4608,2048,ck,12,0,12.3659,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,341.9,967.2,0.0 +gfx950,256,232,4608,2048,ck,2,0,15.4901,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,282.69,777.94,0.0 +gfx950,256,240,4608,2048,ck,2,0,15.6054,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,290.27,777.97,0.0 +gfx950,256,248,4608,2048,ck,2,0,15.4976,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,302.04,789.2,0.0 +gfx950,256,256,4608,2048,ck,2,0,15.5872,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,309.99,790.44,0.0 +gfx950,256,272,4608,2048,ck,2,0,15.3263,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,334.97,815.66,0.0 +gfx950,256,288,4608,2048,ck,2,0,15.6133,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,348.15,812.21,0.0 +gfx950,256,304,4608,2048,ck,2,0,15.6136,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,367.49,823.73,0.0 +gfx950,256,320,4608,2048,ck,2,0,15.3248,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,394.12,851.02,0.0 +gfx950,256,336,4608,2048,ck,2,0,15.4,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,411.8,858.56,0.0 
+gfx950,256,352,4608,2048,ck,2,0,15.3083,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,434.0,875.48,0.0 +gfx950,256,368,4608,2048,ck,2,0,15.4759,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,448.81,877.64,0.0 +gfx950,256,384,4608,2048,ck,2,0,15.2474,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,475.34,902.62,0.0 +gfx950,256,400,4608,2048,ck,2,0,15.4888,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,487.43,900.18,0.0 +gfx950,256,416,4608,2048,ck,2,0,15.4917,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,506.84,911.65,0.0 +gfx950,256,432,4608,2048,ck,2,0,15.5778,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,523.42,918.18,0.0 +gfx950,256,448,4608,2048,ck,2,0,15.3938,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,549.29,940.86,0.0 +gfx950,256,464,4608,2048,ck,3,0,19.5124,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,448.83,751.51,0.0 +gfx950,256,480,4608,2048,ck,3,0,19.689,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,460.14,753.92,0.0 +gfx950,256,496,4608,2048,ck,0,0,20.0754,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,466.33,748.38,0.0 +gfx950,256,512,4608,2048,ck,0,0,19.2945,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,500.85,788.01,0.0 +gfx950,256,664,4608,2048,ck,0,0,20.321,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,616.73,832.46,0.0 
+gfx950,256,762,4608,2048,ck,0,0,20.8438,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,690.0,864.54,0.0 +gfx950,256,902,4608,2048,ck,2,0,29.1331,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,584.38,672.68,0.0 +gfx950,256,961,4608,2048,ck,2,0,28.73,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,631.34,705.25,0.0 +gfx950,256,1000,4608,2048,ck,2,0,29.536,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,639.03,700.88,0.0 +gfx950,256,1001,4608,2048,ck,2,0,29.4826,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,640.83,702.53,0.0 +gfx950,256,1002,4608,2048,ck,2,0,29.2697,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,646.13,708.03,0.0 +gfx950,256,1003,4608,2048,ck,2,0,29.2259,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,647.75,709.47,0.0 +gfx950,256,1015,4608,2048,ck,2,0,29.538,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,648.57,706.55,0.0 +gfx950,256,1023,4608,2048,ck,2,0,29.4426,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,655.8,711.9,0.0 +gfx950,256,1024,4608,2048,ck,2,0,29.5315,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,654.47,710.14,0.0 +gfx950,256,1025,4608,2048,ck,2,0,29.1625,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,663.39,719.51,0.0 +gfx950,256,1032,4608,2048,ck,2,0,29.5521,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,659.12,712.69,0.0 
+gfx950,256,1038,4608,2048,ck,2,0,29.192,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,671.13,723.8,0.0 +gfx950,256,1087,4608,2048,ck,2,0,29.8669,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,686.93,725.93,0.0 +gfx950,256,1255,4608,2048,ck,2,0,30.7197,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,771.08,767.37,0.0 +gfx950,256,1351,4608,2048,ck,0,0,31.8336,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,801.02,774.49,0.0 +gfx950,256,1376,4608,2048,ck,0,0,32.044,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,810.48,778.19,0.0 +gfx950,256,1380,4608,2048,ck,0,0,31.8647,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,817.41,783.99,0.0 +gfx950,256,1391,4608,2048,ck,0,0,32.3276,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,812.13,776.59,0.0 +gfx950,256,1710,4608,2048,ck,0,0,34.3904,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,938.49,834.5,0.0 +gfx950,256,1808,4608,2048,cktile,12,0,39.6031,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,861.67,752.53,0.0 +gfx950,256,1809,4608,2048,cktile,30,0,39.9584,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,854.48,746.12,0.0 +gfx950,256,1870,4608,2048,cktile,30,0,39.7596,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,887.71,767.13,0.0 +gfx950,256,1967,4608,2048,cktile,30,0,40.1129,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,925.53,787.61,0.0 +gfx950,256,1970,4608,2048,cktile,12,0,40.1093,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,927.03,788.53,0.0 
+gfx950,256,2000,4608,2048,cktile,12,0,40.0509,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,942.52,798.11,0.0 +gfx950,256,2001,4608,2048,cktile,30,0,39.9799,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,944.66,799.81,0.0 +gfx950,256,2016,4608,2048,cktile,30,0,40.0965,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,948.98,801.7,0.0 +gfx950,256,2017,4608,2048,cktile,30,0,40.1514,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,948.15,800.89,0.0 +gfx950,256,2046,4608,2048,cktile,12,0,40.4009,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,955.84,804.02,0.0 +gfx950,256,2048,4608,2048,cktile,31,0,40.2325,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,960.78,807.95,0.0 +gfx950,256,2050,4608,2048,cktile,31,0,40.0684,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,965.66,811.82,0.0 +gfx950,256,2081,4608,2048,cktile,12,0,40.509,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,969.6,811.61,0.0 +gfx950,256,2095,4608,2048,cktile,32,0,40.1622,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,984.55,822.55,0.0 +gfx950,256,2154,4608,2048,cktile,32,0,40.7532,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,997.6,826.93,0.0 +gfx950,256,2159,4608,2048,cktile,32,0,40.5418,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1005.13,832.63,0.0 +gfx950,256,2162,4608,2048,cktile,32,0,40.6522,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1003.79,831.2,0.0 +gfx950,256,2165,4608,2048,cktile,31,0,40.6721,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1004.69,831.62,0.0 +gfx950,256,2251,4608,2048,cktile,32,0,40.5101,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1048.78,858.86,0.0 
+gfx950,256,2252,4608,2048,cktile,30,0,40.6519,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1045.59,856.14,0.0 +gfx950,256,2255,4608,2048,cktile,32,0,40.7828,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1043.62,854.22,0.0 +gfx950,256,2317,4608,2048,cktile,31,0,41.1717,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1062.18,863.11,0.0 +gfx950,256,2694,4608,2048,ck,0,0,57.8566,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,878.85,687.6,0.0 +gfx950,256,2715,4608,2048,ck,0,0,58.3055,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,878.89,686.37,0.0 +gfx950,256,2819,4608,2048,ck,0,0,58.717,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,906.16,701.51,0.0 +gfx950,256,2853,4608,2048,ck,0,0,57.987,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,928.63,716.94,0.0 +gfx950,256,2914,4608,2048,ck,0,0,58.6074,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,938.45,721.08,0.0 +gfx950,256,2974,4608,2048,ck,0,0,59.1986,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,948.2,725.29,0.0 +gfx950,256,3000,4608,2048,ck,0,0,58.883,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,961.62,734.15,0.0 +gfx950,256,3001,4608,2048,ck,0,0,59.1954,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,956.86,730.47,0.0 +gfx950,256,3072,4608,2048,ck,0,0,58.5167,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,990.86,752.61,0.0 
+gfx950,256,3073,4608,2048,ck,0,0,59.4195,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,976.13,741.36,0.0 +gfx950,256,3077,4608,2048,ck,0,0,60.6998,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,956.78,726.47,0.0 +gfx950,256,3078,4608,2048,ck,0,0,60.0995,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,966.65,733.91,0.0 +gfx950,256,3086,4608,2048,ck,2,0,66.0495,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,881.86,669.16,0.0 +gfx950,256,3089,4608,2048,ck,0,0,59.6379,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,977.62,741.67,0.0 +gfx950,256,3118,4608,2048,ck,0,0,59.8175,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,983.83,744.9,0.0 +gfx950,256,3138,4608,2048,ck,0,0,59.4051,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,997.01,753.87,0.0 +gfx950,256,3154,4608,2048,ck,0,0,59.7235,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,996.76,752.87,0.0 +gfx950,256,3289,4608,2048,ck,0,0,61.3967,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1011.09,757.12,0.0 +gfx950,256,3437,4608,2048,ck,0,0,61.5567,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1053.84,782.23,0.0 +gfx950,256,3617,4608,2048,ck,0,0,72.074,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,947.2,696.22,0.0 +gfx950,256,3620,4608,2048,ck,0,0,71.4732,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,955.96,702.54,0.0 
+gfx950,256,3669,4608,2048,ck,0,0,72.6692,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,952.95,698.57,0.0 +gfx950,256,3824,4608,2048,ck,0,0,72.8476,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,990.78,720.83,0.0 +gfx950,256,3858,4608,2048,ck,0,0,73.9592,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,984.56,715.17,0.0 +gfx950,256,3906,4608,2048,ck,0,0,73.6152,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1001.47,725.86,0.0 +gfx950,256,4000,4608,2048,cktile,11,0,77.4818,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,974.39,703.3,0.0 +gfx950,256,4004,4608,2048,ck,0,0,74.1832,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1018.73,735.18,0.0 +gfx950,256,4100,4608,2048,ck,0,0,74.4308,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1039.69,747.27,0.0 +gfx950,256,4108,4608,2048,ck,0,0,74.8572,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1035.78,744.21,0.0 +gfx950,256,4121,4608,2048,ck,0,0,75.9184,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1024.54,735.74,0.0 +gfx950,256,4123,4608,2048,ck,0,0,74.898,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1039.0,746.06,0.0 +gfx950,256,4149,4608,2048,ck,0,0,75.1612,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1041.89,747.35,0.0 +gfx950,256,4204,4608,2048,ck,0,0,75.528,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1050.58,751.92,0.0 
+gfx950,256,4223,4608,2048,ck,0,0,74.8364,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1065.08,761.73,0.0 +gfx950,256,4229,4608,2048,ck,0,0,76.6516,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1041.33,744.57,0.0 +gfx950,256,4379,4608,2048,ck,0,0,76.8308,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1075.75,764.83,0.0 +gfx950,256,4410,4608,2048,ck,0,0,76.3492,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1090.2,774.22,0.0 +gfx950,256,4498,4608,2048,cktile,29,0,78.4003,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1082.86,766.61,0.0 +gfx950,256,4585,4608,2048,cktile,11,0,77.9639,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1109.99,783.47,0.0 +gfx950,256,4586,4608,2048,cktile,29,0,78.2003,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1106.87,781.25,0.0 +gfx950,256,4723,4608,2048,cktile,29,0,78.6625,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1133.24,796.28,0.0 +gfx950,256,4743,4608,2048,cktile,28,0,79.0142,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1132.98,795.58,0.0 +gfx950,256,4822,4608,2048,cktile,29,0,78.7063,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1156.35,810.0,0.0 +gfx950,256,4898,4608,2048,cktile,29,0,79.2713,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1166.21,815.03,0.0 +gfx950,256,4936,4608,2048,cktile,11,0,79.191,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1176.45,821.26,0.0 +gfx950,256,4992,4608,2048,cktile,11,0,78.9187,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1193.9,832.09,0.0 +gfx950,256,5000,4608,2048,cktile,11,0,79.1708,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1192.0,830.57,0.0 
+gfx950,256,5001,4608,2048,cktile,11,0,79.3032,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1190.25,829.33,0.0 +gfx950,256,5002,4608,2048,cktile,29,0,79.4845,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1187.77,827.58,0.0 +gfx950,256,5003,4608,2048,cktile,28,0,78.7704,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1198.78,835.22,0.0 +gfx950,256,5004,4608,2048,cktile,11,0,79.5979,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1186.56,826.68,0.0 +gfx950,256,5005,4608,2048,cktile,29,0,79.4023,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1189.72,828.86,0.0 +gfx950,256,5006,4608,2048,cktile,27,0,79.421,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1189.67,828.81,0.0 +gfx950,256,5010,4608,2048,cktile,11,0,79.1324,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1194.97,832.4,0.0 +gfx950,256,5011,4608,2048,cktile,11,0,79.7033,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1186.64,826.58,0.0 +gfx950,256,5012,4608,2048,cktile,29,0,79.3943,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1191.5,829.94,0.0 +gfx950,256,5026,4608,2048,cktile,29,0,79.2291,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1197.32,833.66,0.0 +gfx950,256,5028,4608,2048,cktile,29,0,79.3918,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1195.34,832.23,0.0 +gfx950,256,5030,4608,2048,cktile,28,0,79.5507,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1193.43,830.86,0.0 +gfx950,256,5031,4608,2048,cktile,27,0,79.6919,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1191.55,829.52,0.0 +gfx950,256,5060,4608,2048,cktile,28,0,79.6297,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1199.36,834.27,0.0 
+gfx950,256,5062,4608,2048,cktile,11,0,79.6787,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1199.09,834.04,0.0 +gfx950,256,5063,4608,2048,cktile,28,0,79.7771,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1197.85,833.16,0.0 +gfx950,256,5091,4608,2048,cktile,11,0,79.7431,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1204.99,837.47,0.0 +gfx950,256,5105,4608,2048,cktile,11,0,78.5308,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1226.95,852.4,0.0 +gfx950,256,5120,4608,2048,cktile,11,0,79.2362,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1219.6,846.95,0.0 +gfx950,256,5122,4608,2048,cktile,28,0,79.3724,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1217.99,845.78,0.0 +gfx950,256,5123,4608,2048,cktile,28,0,79.5084,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1216.14,844.47,0.0 +gfx950,256,5124,4608,2048,cktile,11,0,79.9187,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1210.13,840.28,0.0 +gfx950,256,5127,4608,2048,cktile,11,0,79.5839,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1215.94,844.24,0.0 +gfx950,256,5177,4608,2048,cktile,11,0,79.1778,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1234.09,855.68,0.0 +gfx950,256,5183,4608,2048,cktile,27,0,79.3254,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1233.22,854.94,0.0 +gfx950,256,5252,4608,2048,cktile,27,0,79.4265,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1248.05,863.64,0.0 +gfx950,256,5255,4608,2048,cktile,11,0,79.4402,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1248.55,863.91,0.0 +gfx950,256,5361,4608,2048,cktile,27,0,79.8713,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1266.86,874.2,0.0 
+gfx950,256,5427,4608,2048,ck,0,0,97.3792,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1051.88,724.66,0.0 +gfx950,256,5471,4608,2048,ck,0,0,98.3201,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1050.26,722.77,0.0 +gfx950,256,5534,4608,2048,ck,0,0,98.2565,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1063.04,730.46,0.0 +gfx950,256,5564,4608,2048,ck,0,0,98.2236,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1069.16,734.14,0.0 +gfx950,256,5671,4608,2048,ck,0,0,98.8205,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1083.14,741.9,0.0 +gfx950,256,5927,4608,2048,ck,0,0,101.3921,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1103.32,751.53,0.0 +gfx950,256,5954,4608,2048,ck,0,0,100.7081,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.88,759.65,0.0 +gfx950,256,6000,4608,2048,ck,0,0,102.2433,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1107.61,753.31,0.0 +gfx950,256,6002,4608,2048,ck,0,0,101.4225,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1116.95,759.63,0.0 +gfx950,256,6146,4608,2048,ck,0,0,103.0053,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1126.17,763.71,0.0 +gfx950,256,6194,4608,2048,ck,0,0,102.3617,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1142.11,773.79,0.0 +gfx950,256,6227,4608,2048,ck,0,0,102.6182,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1145.32,775.48,0.0 
+gfx950,256,6229,4608,2048,ck,0,0,102.3702,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1148.46,777.58,0.0 +gfx950,256,6234,4608,2048,ck,0,0,102.635,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1146.42,776.12,0.0 +gfx950,256,6246,4608,2048,ck,0,0,102.7539,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1147.3,776.54,0.0 +gfx950,256,6304,4608,2048,ck,0,0,110.412,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1077.64,728.59,0.0 +gfx950,256,6314,4608,2048,cktile,27,0,110.8873,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1074.72,726.49,0.0 +gfx950,256,6318,4608,2048,cktile,28,0,111.027,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1074.05,725.98,0.0 +gfx950,256,6341,4608,2048,ck,0,0,110.6076,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1082.04,731.07,0.0 +gfx950,256,6377,4608,2048,ck,0,0,109.8924,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1095.27,739.52,0.0 +gfx950,256,6476,4608,2048,ck,0,0,110.0884,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.29,748.33,0.0 +gfx950,256,6483,4608,2048,ck,0,0,111.5896,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1096.54,738.97,0.0 +gfx950,256,6492,4608,2048,ck,0,0,109.8748,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.2,751.43,0.0 +gfx950,256,6519,4608,2048,ck,0,0,110.534,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1113.16,749.7,0.0 
+gfx950,256,6609,4608,2048,ck,0,0,110.982,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1123.97,755.81,0.0 +gfx950,256,6811,4608,2048,cktile,11,0,112.7618,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1140.04,764.06,0.0 +gfx950,256,6813,4608,2048,cktile,29,0,112.5162,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1142.87,765.92,0.0 +gfx950,256,6818,4608,2048,cktile,27,0,112.977,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1139.04,763.3,0.0 +gfx950,256,6830,4608,2048,cktile,11,0,112.9106,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1141.72,764.94,0.0 +gfx950,256,7001,4608,2048,cktile,27,0,112.2277,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1177.42,786.76,0.0 +gfx950,256,7025,4608,2048,cktile,11,0,112.9981,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1173.4,783.79,0.0 +gfx950,256,7121,4608,2048,cktile,11,0,112.2616,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1197.24,798.56,0.0 +gfx950,256,7169,4608,2048,cktile,11,0,112.568,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1202.03,801.19,0.0 +gfx950,256,7177,4608,2048,cktile,11,0,113.0119,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1198.65,798.84,0.0 +gfx950,256,7217,4608,2048,cktile,29,0,112.4295,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1211.57,806.99,0.0 +gfx950,256,7223,4608,2048,cktile,28,0,112.97,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1206.78,803.73,0.0 +gfx950,256,7238,4608,2048,cktile,11,0,112.8224,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1210.86,806.28,0.0 +gfx950,256,7246,4608,2048,cktile,29,0,112.6329,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1214.24,808.43,0.0 
+gfx950,256,7249,4608,2048,cktile,11,0,112.8662,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1212.23,807.06,0.0 +gfx950,256,7316,4608,2048,cktile,29,0,113.7013,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1214.45,807.77,0.0 +gfx950,256,7393,4608,2048,cktile,11,0,113.5041,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1229.37,816.82,0.0 +gfx950,256,7491,4608,2048,cktile,28,0,113.8772,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1241.58,823.83,0.0 +gfx950,256,7558,4608,2048,cktile,28,0,113.9509,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1251.88,829.92,0.0 +gfx950,256,7650,4608,2048,cktile,29,0,113.9294,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1267.35,839.18,0.0 +gfx950,256,7753,4608,2048,cktile,11,0,113.6156,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1287.97,851.71,0.0 +gfx950,256,7774,4608,2048,cktile,28,0,113.7991,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1289.37,852.41,0.0 +gfx950,256,7815,4608,2048,cktile,28,0,113.3843,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1300.91,859.6,0.0 +gfx950,256,7834,4608,2048,cktile,28,0,113.5747,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1301.89,860.05,0.0 +gfx950,256,7836,4608,2048,cktile,27,0,113.8783,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1298.75,857.95,0.0 +gfx950,256,7871,4608,2048,cktile,29,0,114.0979,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1302.04,859.75,0.0 +gfx950,256,7899,4608,2048,cktile,29,0,113.9587,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1308.27,863.57,0.0 +gfx950,256,7913,4608,2048,cktile,27,0,114.1395,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1308.51,863.59,0.0 
+gfx950,256,7930,4608,2048,cktile,11,0,113.6046,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1317.5,869.34,0.0 +gfx950,256,7961,4608,2048,cktile,28,0,113.3604,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1325.5,874.29,0.0 +gfx950,256,7963,4608,2048,cktile,11,0,114.2922,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1315.02,867.36,0.0 +gfx950,256,7974,4608,2048,cktile,28,0,114.2869,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1316.9,868.48,0.0 +gfx950,256,8016,4608,2048,cktile,28,0,114.472,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1321.69,871.21,0.0 +gfx950,256,8017,4608,2048,cktile,29,0,114.2498,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1324.43,873.01,0.0 +gfx950,256,8103,4608,2048,cktile,27,0,128.9244,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1186.27,781.15,0.0 +gfx950,256,8128,4608,2048,cktile,11,0,129.314,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1186.34,780.97,0.0 +gfx950,256,8129,4608,2048,cktile,28,0,128.2055,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1196.75,787.82,0.0 +gfx950,256,8150,4608,2048,cktile,28,0,129.0731,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1191.78,784.35,0.0 +gfx950,256,8184,4608,2048,cktile,28,0,128.7103,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1200.12,789.54,0.0 +gfx950,256,8185,4608,2048,cktile,29,0,129.3407,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1194.42,785.78,0.0 +gfx950,256,8192,4608,2048,ck,0,0,126.9213,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1218.23,801.38,0.0 +gfx950,256,1,6144,2048,ck,8,3,7.1462,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.52,1762.79,0.0 
+gfx950,256,2,6144,2048,ck,8,3,7.7343,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.51,1630.6,0.0 +gfx950,256,4,6144,2048,ck,8,3,7.7739,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.95,1625.99,0.0 +gfx950,256,8,6144,2048,ck,8,2,8.0312,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.07,1581.03,0.0 +gfx950,256,16,6144,2048,ck,8,2,8.3729,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,48.09,1530.21,0.0 +gfx950,256,24,6144,2048,ck,8,2,9.0915,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,66.43,1421.87,0.0 +gfx950,256,32,6144,2048,ck,6,0,14.3518,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_1x1_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,56.11,908.71,0.0 +gfx950,256,40,6144,2048,ck,7,0,10.4014,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,96.78,1264.86,0.0 +gfx950,256,48,6144,2048,ck,7,0,10.305,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,117.22,1287.83,0.0 +gfx950,256,56,6144,2048,ck,7,0,10.5349,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,133.77,1270.61,0.0 +gfx950,256,64,6144,2048,ck,7,0,10.235,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,157.36,1319.04,0.0 +gfx950,256,72,6144,2048,ck,7,0,10.2488,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,176.8,1328.46,0.0 +gfx950,256,80,6144,2048,ck,7,0,10.2564,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,196.29,1338.66,0.0 
+gfx950,256,88,6144,2048,ck,18,0,11.4901,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,192.74,1204.91,0.0 +gfx950,256,96,6144,2048,ck,18,0,11.4203,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,211.55,1222.31,0.0 +gfx950,256,104,6144,2048,ck,18,0,11.8589,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,220.7,1186.78,0.0 +gfx950,256,112,6144,2048,ck,18,0,11.9214,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,236.43,1190.17,0.0 +gfx950,256,120,6144,2048,ck,18,0,11.8618,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,254.59,1205.82,0.0 +gfx950,256,128,6144,2048,ck,18,0,10.8745,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,296.22,1325.85,0.0 +gfx950,256,136,6144,2048,ck,12,0,13.0632,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,262.0,1112.48,0.0 +gfx950,256,144,6144,2048,ck,12,0,12.9269,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,280.34,1133.09,0.0 +gfx950,256,152,6144,2048,ck,12,0,12.8599,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,297.45,1147.91,0.0 +gfx950,256,160,6144,2048,ck,12,0,12.3341,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,326.46,1206.14,0.0 +gfx950,256,168,6144,2048,ck,3,0,16.1668,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,261.51,927.29,0.0 +gfx950,256,176,6144,2048,ck,2,0,16.564,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,267.4,911.98,0.0 
+gfx950,256,184,6144,2048,ck,2,0,16.42,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,282.0,926.96,0.0 +gfx950,256,192,6144,2048,ck,2,0,15.2464,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,316.92,1005.84,0.0 +gfx950,256,200,6144,2048,ck,2,0,15.3004,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,328.96,1009.78,0.0 +gfx950,256,208,6144,2048,ck,2,0,15.3587,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,340.82,1013.42,0.0 +gfx950,256,216,6144,2048,ck,2,0,15.3538,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,354.04,1021.21,0.0 +gfx950,256,224,6144,2048,ck,2,0,15.4829,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,364.09,1020.1,0.0 +gfx950,256,232,6144,2048,ck,2,0,15.4377,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,378.2,1030.52,0.0 +gfx950,256,240,6144,2048,ck,2,0,15.5234,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,389.08,1032.22,0.0 +gfx950,256,248,6144,2048,ck,2,0,15.492,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,402.86,1041.71,0.0 +gfx950,256,256,6144,2048,ck,2,0,15.2952,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,421.21,1062.62,0.0 +gfx950,256,272,6144,2048,ck,2,0,15.4594,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,442.78,1066.17,0.0 +gfx950,256,288,6144,2048,ck,2,0,15.7035,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,461.54,1064.2,0.0 
+gfx950,256,304,6144,2048,ck,2,0,16.57,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,461.7,1022.39,0.0 +gfx950,256,320,6144,2048,ck,2,0,15.6725,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,513.83,1095.58,0.0 +gfx950,256,336,6144,2048,ck,3,0,19.087,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,443.01,911.61,0.0 +gfx950,256,352,6144,2048,ck,3,0,19.2438,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,460.32,916.1,0.0 +gfx950,256,368,6144,2048,ck,3,0,19.4599,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,475.9,917.71,0.0 +gfx950,256,384,6144,2048,ck,0,0,19.2787,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,501.26,938.23,0.0 +gfx950,256,400,6144,2048,ck,0,0,19.7987,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,508.43,925.18,0.0 +gfx950,256,416,6144,2048,ck,0,0,19.9708,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,524.21,928.69,0.0 +gfx950,256,432,6144,2048,ck,0,0,20.1293,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,540.09,932.77,0.0 +gfx950,256,448,6144,2048,ck,0,0,20.2676,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,556.27,937.73,0.0 +gfx950,256,464,6144,2048,ck,0,0,20.2733,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,575.98,948.78,0.0 +gfx950,256,480,6144,2048,ck,0,0,20.4057,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,591.97,953.86,0.0 
+gfx950,256,496,6144,2048,ck,0,0,20.5701,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,606.82,957.39,0.0 +gfx950,256,512,6144,2048,ck,0,0,19.6396,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,656.07,1014.43,0.0 +gfx950,256,664,6144,2048,ck,2,0,28.4642,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,587.06,776.48,0.0 +gfx950,256,762,6144,2048,ck,2,0,28.8483,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,664.73,814.85,0.0 +gfx950,256,902,6144,2048,ck,2,0,30.3879,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,746.99,839.61,0.0 +gfx950,256,961,6144,2048,ck,2,0,30.3457,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,796.96,868.65,0.0 +gfx950,256,1000,6144,2048,ck,2,0,31.3273,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,803.32,859.28,0.0 +gfx950,256,1001,6144,2048,ck,2,0,31.2785,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,805.38,861.08,0.0 +gfx950,256,1002,6144,2048,ck,2,0,30.7465,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,820.13,876.44,0.0 +gfx950,256,1003,6144,2048,ck,2,0,31.5633,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,799.7,854.22,0.0 +gfx950,256,1015,6144,2048,ck,2,0,31.7344,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,804.91,855.03,0.0 +gfx950,256,1023,6144,2048,ck,2,0,31.3041,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,822.4,870.45,0.0 
+gfx950,256,1024,6144,2048,ck,0,0,31.7313,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,812.13,859.18,0.0 +gfx950,256,1025,6144,2048,ck,0,0,32.9492,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,782.87,827.86,0.0 +gfx950,256,1032,6144,2048,ck,0,0,32.7757,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,792.39,835.3,0.0 +gfx950,256,1038,6144,2048,ck,0,0,32.5666,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,802.11,843.31,0.0 +gfx950,256,1087,6144,2048,ck,0,0,32.3185,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,846.43,871.52,0.0 +gfx950,256,1255,6144,2048,ck,0,0,34.4389,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,917.08,887.79,0.0 +gfx950,256,1351,6144,2048,cktile,12,0,39.731,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,855.73,804.18,0.0 +gfx950,256,1376,6144,2048,cktile,31,0,39.6114,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,874.2,815.66,0.0 +gfx950,256,1380,6144,2048,cktile,31,0,39.68,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,875.22,815.69,0.0 +gfx950,256,1391,6144,2048,cktile,32,0,39.7942,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,879.67,817.31,0.0 +gfx950,256,1710,6144,2048,cktile,32,0,40.1807,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1071.0,923.27,0.0 +gfx950,256,1808,6144,2048,cktile,32,0,40.9486,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1111.14,940.26,0.0 +gfx950,256,1809,6144,2048,cktile,12,0,41.4541,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1098.2,929.14,0.0 
+gfx950,256,1870,6144,2048,cktile,30,0,41.2348,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1141.27,955.29,0.0 +gfx950,256,1967,6144,2048,ck,0,0,50.2701,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,984.7,811.25,0.0 +gfx950,256,1970,6144,2048,ck,0,0,49.7949,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,995.62,819.86,0.0 +gfx950,256,2000,6144,2048,ck,0,0,49.6955,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1012.8,830.15,0.0 +gfx950,256,2001,6144,2048,ck,0,0,50.4797,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,997.57,817.54,0.0 +gfx950,256,2016,6144,2048,ck,0,0,50.6721,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1001.23,818.68,0.0 +gfx950,256,2017,6144,2048,ck,0,0,50.3885,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1007.36,823.57,0.0 +gfx950,256,2046,6144,2048,ck,0,0,51.2889,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1003.91,817.22,0.0 +gfx950,256,2048,6144,2048,ck,0,0,49.792,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1035.1,842.37,0.0 +gfx950,256,2050,6144,2048,ck,0,0,57.7804,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,892.86,726.4,0.0 +gfx950,256,2081,6144,2048,ck,0,0,58.4126,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,896.55,726.15,0.0 +gfx950,256,2095,6144,2048,ck,0,0,58.295,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,904.41,731.05,0.0 
+gfx950,256,2154,6144,2048,ck,0,0,58.3395,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,929.17,745.0,0.0 +gfx950,256,2159,6144,2048,ck,0,0,58.6267,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,926.76,742.57,0.0 +gfx950,256,2162,6144,2048,ck,0,0,58.8944,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,923.83,739.92,0.0 +gfx950,256,2165,6144,2048,ck,0,0,58.5918,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,929.89,744.48,0.0 +gfx950,256,2251,6144,2048,ck,0,0,59.0894,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,958.69,759.07,0.0 +gfx950,256,2252,6144,2048,ck,0,0,58.9366,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,961.6,761.29,0.0 +gfx950,256,2255,6144,2048,ck,0,0,59.1658,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,959.15,759.06,0.0 +gfx950,256,2317,6144,2048,ck,0,0,59.2654,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,983.87,772.79,0.0 +gfx950,256,2694,6144,2048,ck,0,0,72.5039,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,935.08,706.23,0.0 +gfx950,256,2715,6144,2048,ck,0,0,72.3971,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,943.76,711.43,0.0 +gfx950,256,2819,6144,2048,ck,0,0,72.3223,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,980.92,732.78,0.0 +gfx950,256,2853,6144,2048,ck,0,0,73.1979,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,980.88,730.67,0.0 
+gfx950,256,2914,6144,2048,ck,0,0,72.9171,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1005.71,745.48,0.0 +gfx950,256,2974,6144,2048,ck,0,0,73.6987,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1015.53,749.24,0.0 +gfx950,256,3000,6144,2048,ck,0,0,73.8311,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1022.57,752.95,0.0 +gfx950,256,3001,6144,2048,ck,0,0,73.8287,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1022.94,753.17,0.0 +gfx950,256,3072,6144,2048,ck,0,0,72.5024,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1066.3,780.98,0.0 +gfx950,256,3073,6144,2048,ck,0,0,75.2756,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1027.35,752.4,0.0 +gfx950,256,3077,6144,2048,ck,0,0,74.8732,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1034.22,757.21,0.0 +gfx950,256,3078,6144,2048,ck,0,0,74.68,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1037.23,759.36,0.0 +gfx950,256,3086,6144,2048,cktile,28,0,77.0254,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1008.26,737.73,0.0 +gfx950,256,3089,6144,2048,ck,0,0,74.2548,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1046.9,765.83,0.0 +gfx950,256,3118,6144,2048,ck,0,0,76.3388,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1027.88,750.37,0.0 +gfx950,256,3138,6144,2048,ck,0,0,74.7804,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1056.03,769.84,0.0 
+gfx950,256,3154,6144,2048,ck,0,0,75.1908,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1055.62,768.69,0.0 +gfx950,256,3289,6144,2048,ck,0,0,75.0748,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1102.51,795.66,0.0 +gfx950,256,3437,6144,2048,cktile,27,0,78.0738,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1107.86,792.27,0.0 +gfx950,256,3617,6144,2048,cktile,28,0,78.1166,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1165.24,824.87,0.0 +gfx950,256,3620,6144,2048,cktile,28,0,78.1755,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1165.33,824.8,0.0 +gfx950,256,3669,6144,2048,cktile,28,0,78.4939,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1176.31,830.4,0.0 +gfx950,256,3824,6144,2048,cktile,28,0,78.7606,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1221.86,855.81,0.0 +gfx950,256,3858,6144,2048,cktile,28,0,79.3294,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1223.88,855.81,0.0 +gfx950,256,3906,6144,2048,cktile,28,0,79.5219,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1236.11,862.4,0.0 +gfx950,256,4000,6144,2048,cktile,29,0,80.0798,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1257.04,873.22,0.0 +gfx950,256,4004,6144,2048,cktile,28,0,80.2699,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1255.31,871.86,0.0 +gfx950,256,4100,6144,2048,ck,0,0,98.819,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1044.13,722.13,0.0 +gfx950,256,4108,6144,2048,ck,0,0,98.483,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1049.74,725.76,0.0 
+gfx950,256,4121,6144,2048,ck,0,0,99.291,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1044.49,721.73,0.0 +gfx950,256,4123,6144,2048,ck,0,0,97.3962,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1065.33,736.07,0.0 +gfx950,256,4149,6144,2048,ck,0,0,99.053,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1054.11,727.52,0.0 +gfx950,256,4204,6144,2048,ck,0,0,98.9506,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1069.19,736.24,0.0 +gfx950,256,4223,6144,2048,ck,0,0,99.9354,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1063.44,731.71,0.0 +gfx950,256,4229,6144,2048,ck,0,0,100.7742,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1056.09,726.47,0.0 +gfx950,256,4379,6144,2048,ck,0,0,99.201,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1110.89,759.67,0.0 +gfx950,256,4410,6144,2048,ck,0,0,99.5754,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1114.55,761.28,0.0 +gfx950,256,4498,6144,2048,ck,0,0,100.3558,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1127.95,767.93,0.0 +gfx950,256,4585,6144,2048,ck,0,0,100.5363,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1147.7,778.96,0.0 +gfx950,256,4586,6144,2048,cktile,11,0,111.5145,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1034.94,702.4,0.0 +gfx950,256,4723,6144,2048,ck,0,0,101.9207,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1166.18,787.79,0.0 
+gfx950,256,4743,6144,2048,ck,0,0,109.9695,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1085.41,732.74,0.0 +gfx950,256,4822,6144,2048,ck,0,0,111.2368,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1090.91,734.57,0.0 +gfx950,256,4898,6144,2048,ck,0,0,110.6756,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1113.73,748.14,0.0 +gfx950,256,4936,6144,2048,ck,0,0,110.6856,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.26,752.99,0.0 +gfx950,256,4992,6144,2048,ck,0,0,108.5943,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1156.85,774.89,0.0 +gfx950,256,5000,6144,2048,ck,0,0,112.1236,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1122.24,751.52,0.0 +gfx950,256,5001,6144,2048,ck,0,0,112.8696,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1115.04,746.68,0.0 +gfx950,256,5002,6144,2048,ck,0,0,111.3312,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1130.68,757.12,0.0 +gfx950,256,5003,6144,2048,ck,0,0,111.4484,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1129.71,756.46,0.0 +gfx950,256,5004,6144,2048,cktile,11,0,112.8586,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1115.82,747.13,0.0 +gfx950,256,5005,6144,2048,ck,0,0,110.9815,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1134.92,759.9,0.0 +gfx950,256,5006,6144,2048,ck,0,0,111.4596,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1130.28,756.77,0.0 
+gfx950,256,5010,6144,2048,ck,0,0,111.3228,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1132.57,758.21,0.0 +gfx950,256,5011,6144,2048,ck,0,0,112.1708,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1124.23,752.61,0.0 +gfx950,256,5012,6144,2048,cktile,28,0,112.5118,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1121.05,750.45,0.0 +gfx950,256,5026,6144,2048,ck,0,0,111.2624,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1136.8,760.69,0.0 +gfx950,256,5028,6144,2048,ck,0,0,111.1032,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1138.88,762.03,0.0 +gfx950,256,5030,6144,2048,ck,0,0,111.6172,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1134.09,758.78,0.0 +gfx950,256,5031,6144,2048,ck,0,0,111.3288,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1137.26,760.88,0.0 +gfx950,256,5060,6144,2048,ck,0,0,111.0543,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1146.64,766.5,0.0 +gfx950,256,5062,6144,2048,ck,0,0,111.8047,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1139.39,761.61,0.0 +gfx950,256,5063,6144,2048,ck,0,0,111.5107,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1142.62,763.75,0.0 +gfx950,256,5091,6144,2048,ck,0,0,111.5855,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1148.17,766.83,0.0 +gfx950,256,5105,6144,2048,ck,0,0,111.5887,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1151.3,768.61,0.0 
+gfx950,256,5120,6144,2048,ck,0,0,109.8307,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1173.16,782.87,0.0 +gfx950,256,5122,6144,2048,ck,0,0,112.0267,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1150.61,767.78,0.0 +gfx950,256,5123,6144,2048,ck,0,0,111.7391,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1153.8,769.88,0.0 +gfx950,256,5124,6144,2048,ck,0,0,112.4679,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1146.55,765.02,0.0 +gfx950,256,5127,6144,2048,ck,0,0,113.2751,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1139.04,759.95,0.0 +gfx950,256,5177,6144,2048,cktile,11,0,112.7762,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1155.24,769.67,0.0 +gfx950,256,5183,6144,2048,cktile,29,0,114.0848,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1143.31,761.59,0.0 +gfx950,256,5252,6144,2048,cktile,29,0,112.7518,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1172.23,779.37,0.0 +gfx950,256,5255,6144,2048,cktile,27,0,113.3791,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1166.41,775.44,0.0 +gfx950,256,5361,6144,2048,cktile,28,0,113.1696,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1192.14,790.3,0.0 +gfx950,256,5427,6144,2048,cktile,28,0,112.3191,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1215.95,804.71,0.0 +gfx950,256,5471,6144,2048,cktile,29,0,113.1175,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1217.16,804.61,0.0 +gfx950,256,5534,6144,2048,cktile,27,0,113.088,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1231.5,812.8,0.0 
+gfx950,256,5564,6144,2048,cktile,29,0,113.2764,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1236.11,815.25,0.0 +gfx950,256,5671,6144,2048,cktile,27,0,112.9223,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1263.84,831.39,0.0 +gfx950,256,5927,6144,2048,cktile,28,0,114.1084,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1307.16,854.91,0.0 +gfx950,256,5954,6144,2048,cktile,28,0,114.7564,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1305.7,853.46,0.0 +gfx950,256,6000,6144,2048,cktile,28,0,114.3886,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1320.02,861.96,0.0 +gfx950,256,6002,6144,2048,cktile,28,0,114.3359,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1321.07,862.61,0.0 +gfx950,256,6146,6144,2048,ck,0,0,136.0023,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1137.25,740.37,0.0 +gfx950,256,6194,6144,2048,ck,0,0,135.7784,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1148.03,746.66,0.0 +gfx950,256,6227,6144,2048,ck,0,0,135.2668,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1158.51,752.98,0.0 +gfx950,256,6229,6144,2048,ck,0,0,136.6913,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1146.8,745.34,0.0 +gfx950,256,6234,6144,2048,ck,0,0,135.9465,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1154.01,749.95,0.0 +gfx950,256,6246,6144,2048,ck,0,0,136.1362,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1154.62,750.17,0.0 +gfx950,256,6304,6144,2048,ck,0,0,136.1042,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1165.62,756.46,0.0 
+gfx950,256,6314,6144,2048,ck,0,0,136.4754,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1164.29,755.45,0.0 +gfx950,256,6318,6144,2048,ck,0,0,135.0606,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1177.23,763.79,0.0 +gfx950,256,6341,6144,2048,ck,0,0,136.7254,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1167.13,756.9,0.0 +gfx950,256,6377,6144,2048,ck,0,0,135.7821,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1181.91,765.96,0.0 +gfx950,256,6476,6144,2048,ck,0,0,137.4454,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1185.74,767.02,0.0 +gfx950,256,6483,6144,2048,ck,0,0,137.1714,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1189.39,769.28,0.0 +gfx950,256,6492,6144,2048,ck,0,0,137.1706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1191.05,770.22,0.0 +gfx950,256,6519,6144,2048,ck,0,0,138.9126,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1181.0,763.35,0.0 +gfx950,256,6609,6144,2048,ck,0,0,138.2234,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1203.28,776.49,0.0 +gfx950,256,6811,6144,2048,cktile,29,0,146.8029,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1167.58,750.84,0.0 +gfx950,256,6813,6144,2048,ck,0,0,146.7747,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1168.15,751.18,0.0 +gfx950,256,6818,6144,2048,ck,0,0,145.8932,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1176.07,756.21,0.0 
+gfx950,256,6830,6144,2048,cktile,27,0,146.5697,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1172.7,753.89,0.0 +gfx950,256,7001,6144,2048,cktile,27,0,146.4497,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1203.05,771.25,0.0 +gfx950,256,7025,6144,2048,cktile,11,0,146.0693,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1210.32,775.61,0.0 +gfx950,256,7121,6144,2048,cktile,27,0,147.2171,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1217.29,778.91,0.0 +gfx950,256,7169,6144,2048,cktile,11,0,147.1764,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1225.83,783.81,0.0 +gfx950,256,7177,6144,2048,cktile,27,0,146.8906,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1229.59,786.11,0.0 +gfx950,256,7217,6144,2048,cktile,11,0,147.4378,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1231.85,787.08,0.0 +gfx950,256,7223,6144,2048,cktile,11,0,146.5161,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1240.63,792.62,0.0 +gfx950,256,7238,6144,2048,cktile,28,0,146.2391,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1245.56,795.59,0.0 +gfx950,256,7246,6144,2048,cktile,28,0,146.4743,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1244.94,795.1,0.0 +gfx950,256,7249,6144,2048,cktile,27,0,146.332,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1246.67,796.17,0.0 +gfx950,256,7316,6144,2048,cktile,28,0,148.0019,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1243.99,793.67,0.0 +gfx950,256,7393,6144,2048,cktile,11,0,148.1505,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1255.82,800.33,0.0 +gfx950,256,7491,6144,2048,cktile,27,0,148.6239,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1268.42,807.23,0.0 
+gfx950,256,7558,6144,2048,cktile,28,0,148.3728,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1281.93,815.07,0.0 +gfx950,256,7650,6144,2048,cktile,29,0,149.0133,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1291.96,820.42,0.0 +gfx950,256,7753,6144,2048,cktile,11,0,148.1962,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1316.57,834.91,0.0 +gfx950,256,7774,6144,2048,cktile,11,0,148.3358,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1318.89,836.15,0.0 +gfx950,256,7815,6144,2048,cktile,28,0,148.4638,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1324.71,839.39,0.0 +gfx950,256,7834,6144,2048,cktile,29,0,148.1223,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1330.99,843.16,0.0 +gfx950,256,7836,6144,2048,cktile,11,0,148.1999,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1330.63,842.91,0.0 +gfx950,256,7871,6144,2048,cktile,29,0,147.6947,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1341.15,849.19,0.0 +gfx950,256,7899,6144,2048,cktile,28,0,148.6611,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1337.17,846.37,0.0 +gfx950,256,7913,6144,2048,cktile,28,0,147.9143,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1346.3,852.0,0.0 +gfx950,256,7930,6144,2048,cktile,11,0,148.5611,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1343.32,849.94,0.0 +gfx950,256,7961,6144,2048,cktile,27,0,147.9914,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1353.76,856.21,0.0 +gfx950,256,7963,6144,2048,cktile,28,0,148.879,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1346.03,851.3,0.0 +gfx950,256,7974,6144,2048,cktile,29,0,148.4726,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1351.58,854.69,0.0 
+gfx950,256,8016,6144,2048,cktile,27,0,148.7386,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1356.27,857.21,0.0 +gfx950,256,8017,6144,2048,cktile,11,0,148.2054,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1361.32,860.39,0.0 +gfx950,256,8103,6144,2048,cktile,29,0,164.6707,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1238.34,781.85,0.0 +gfx950,256,8128,6144,2048,cktile,29,0,164.3651,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1244.47,785.48,0.0 +gfx950,256,8129,6144,2048,cktile,28,0,164.0366,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1247.12,787.14,0.0 +gfx950,256,8150,6144,2048,cktile,11,0,163.9558,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1250.96,789.37,0.0 +gfx950,256,8184,6144,2048,cktile,27,0,163.8378,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1257.08,792.91,0.0 +gfx950,256,8185,6144,2048,cktile,29,0,164.7786,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1250.05,788.47,0.0 +gfx950,256,8192,6144,2048,cktile,27,0,162.1342,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1271.53,801.95,0.0 +gfx950,256,1,9216,2048,ck,8,3,8.8431,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.27,2136.68,0.0 +gfx950,256,2,9216,2048,ck,8,0,8.8707,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,8.51,2132.34,0.0 +gfx950,256,4,9216,2048,ck,8,0,8.8727,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.02,2136.47,0.0 +gfx950,256,8,9216,2048,ck,8,0,8.934,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,33.8,2130.98,0.0 
+gfx950,256,16,9216,2048,ck,8,0,8.7072,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,69.37,2205.31,0.0 +gfx950,256,32,9216,2048,ck,7,0,10.4857,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,115.2,1862.51,0.0 +gfx950,256,48,9216,2048,ck,7,0,10.5927,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,171.06,1874.63,0.0 +gfx950,256,64,9216,2048,ck,18,0,10.3626,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,233.14,1947.88,0.0 +gfx950,256,80,9216,2048,ck,12,0,13.1732,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,229.25,1557.16,0.0 +gfx950,256,96,9216,2048,ck,12,0,12.2582,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,295.63,1700.12,0.0 +gfx950,256,112,9216,2048,ck,5,0,15.2864,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,276.58,1384.77,0.0 +gfx950,256,128,9216,2048,ck,2,0,15.5977,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,309.78,1378.14,0.0 +gfx950,256,256,9216,2048,ck,0,0,19.5869,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,493.37,1231.29,0.0 +gfx950,256,512,9216,2048,ck,2,0,29.5011,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,655.14,995.22,0.0 +gfx950,256,667,9216,2048,ck,0,0,32.4893,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,774.98,1001.39,0.0 +gfx950,256,671,9216,2048,ck,0,0,32.471,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,780.06,1004.48,0.0 
+gfx950,256,931,9216,2048,cktile,32,0,39.2748,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,894.83,966.05,0.0 +gfx950,256,1024,9216,2048,cktile,30,0,39.7171,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,973.25,1003.24,0.0 +gfx950,256,1025,9216,2048,cktile,12,0,40.0882,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,965.18,994.47,0.0 +gfx950,256,1027,9216,2048,cktile,12,0,39.8563,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,972.69,1001.28,0.0 +gfx950,256,1031,9216,2048,cktile,30,0,39.9577,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,974.0,1000.79,0.0 +gfx950,256,1039,9216,2048,cktile,32,0,39.9331,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,982.17,1005.51,0.0 +gfx950,256,1040,9216,2048,cktile,12,0,39.9746,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,982.09,1004.98,0.0 +gfx950,256,1041,9216,2048,cktile,31,0,39.8433,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,986.27,1008.8,0.0 +gfx950,256,1072,9216,2048,cktile,12,0,39.8667,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1015.05,1024.14,0.0 +gfx950,256,1073,9216,2048,cktile,12,0,40.2558,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1_aqrm,1006.18,1014.75,0.0 +gfx950,256,1077,9216,2048,cktile,32,0,40.1581,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1012.38,1019.25,0.0 +gfx950,256,1091,9216,2048,cktile,31,0,40.3859,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3_aqrm,1019.76,1020.6,0.0 +gfx950,256,1343,9216,2048,cktile,32,0,41.5051,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4_aqrm,1221.45,1117.43,0.0 +gfx950,256,1688,9216,2048,ck,0,0,62.2297,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1023.95,858.83,0.0 
+gfx950,256,2017,9216,2048,ck,0,0,75.1043,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1013.78,801.32,0.0 +gfx950,256,2048,9216,2048,ck,0,0,73.9583,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1045.31,822.32,0.0 +gfx950,256,2050,9216,2048,ck,0,0,75.5639,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1024.1,805.39,0.0 +gfx950,256,2063,9216,2048,ck,0,0,75.6899,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1028.88,807.57,0.0 +gfx950,256,2064,9216,2048,ck,0,0,75.4343,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1032.86,810.57,0.0 +gfx950,256,2099,9216,2048,ck,0,0,76.1664,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1040.28,812.19,0.0 +gfx950,256,2159,9216,2048,cktile,29,0,76.834,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1060.72,821.13,0.0 +gfx950,256,2160,9216,2048,cktile,11,0,77.0103,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1058.78,819.52,0.0 +gfx950,256,2313,9216,2048,cktile,11,0,77.532,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1126.15,854.42,0.0 +gfx950,256,2325,9216,2048,cktile,27,0,77.937,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1126.11,853.13,0.0 +gfx950,256,3072,9216,2048,ck,0,0,103.019,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1125.66,793.92,0.0 +gfx950,256,3073,9216,2048,cktile,11,0,106.1026,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1093.3,771.04,0.0 +gfx950,256,3182,9216,2048,cktile,27,0,108.1019,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1111.14,777.43,0.0 
+gfx950,256,3183,9216,2048,cktile,11,0,108.3665,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1108.78,775.72,0.0 +gfx950,256,3847,9216,2048,cktile,29,0,112.7557,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1287.91,866.13,0.0 +gfx950,256,4096,9216,2048,ck,0,0,129.1944,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1196.79,795.39,0.0 +gfx950,256,4111,9216,2048,cktile,28,0,126.8622,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1223.26,812.44,0.0 +gfx950,256,4141,9216,2048,cktile,29,0,127.4174,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1226.81,813.72,0.0 +gfx950,256,4142,9216,2048,cktile,11,0,125.9118,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1241.78,823.61,0.0 +gfx950,256,4345,9216,2048,cktile,27,0,133.6991,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1226.77,806.74,0.0 +gfx950,256,5105,9216,2048,cktile,27,0,145.3818,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1325.53,848.97,0.0 +gfx950,256,5121,9216,2048,cktile,27,0,146.038,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1323.71,847.4,0.0 +gfx950,256,5123,9216,2048,cktile,11,0,145.8598,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1325.84,848.72,0.0 +gfx950,256,5131,9216,2048,cktile,28,0,145.75,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1328.91,850.48,0.0 +gfx950,256,5427,9216,2048,cktile,28,0,155.1798,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1320.16,837.86,0.0 +gfx950,256,6113,9216,2048,cktile,27,0,176.2697,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1309.12,817.32,0.0 +gfx950,256,6197,9216,2048,cktile,11,0,178.8734,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1307.79,815.04,0.0 
+gfx950,256,6409,9216,2048,cktile,28,0,179.6276,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1346.85,835.79,0.0 +gfx950,256,7121,9216,2048,cktile,11,0,199.0209,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1350.66,827.61,0.0 +gfx950,256,7168,9216,2048,cktile,29,0,200.9921,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1346.24,824.29,0.0 +gfx950,256,7169,9216,2048,cktile,11,0,197.9219,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1367.31,837.18,0.0 +gfx950,256,7172,9216,2048,cktile,11,0,199.0792,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1359.93,832.62,0.0 +gfx950,256,7176,9216,2048,cktile,28,0,199.3176,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1359.06,832.03,0.0 +gfx950,256,7177,9216,2048,cktile,11,0,197.8645,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1369.23,838.25,0.0 +gfx950,256,7178,9216,2048,cktile,29,0,198.4669,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1365.27,835.81,0.0 +gfx950,256,7179,9216,2048,cktile,27,0,198.1989,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1367.3,837.04,0.0 +gfx950,256,7183,9216,2048,cktile,29,0,198.8009,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1363.92,834.92,0.0 +gfx950,256,7184,9216,2048,cktile,29,0,198.9317,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1363.22,834.47,0.0 +gfx950,256,7209,9216,2048,cktile,11,0,198.5542,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1370.56,838.64,0.0 +gfx950,256,7210,9216,2048,cktile,27,0,199.1638,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1366.56,836.17,0.0 +gfx950,256,7211,9216,2048,cktile,11,0,198.5734,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1370.81,838.76,0.0 
+gfx950,256,7217,9216,2048,cktile,11,0,199.0679,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1368.54,837.29,0.0 +gfx950,256,7273,9216,2048,cktile,11,0,198.6939,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1381.76,844.64,0.0 +gfx950,256,7274,9216,2048,cktile,11,0,198.0307,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1386.57,847.58,0.0 +gfx950,256,7391,9216,2048,cktile,11,0,205.4884,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1357.75,828.48,0.0 +gfx950,256,7393,9216,2048,cktile,11,0,206.0379,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1354.49,826.46,0.0 +gfx950,256,7715,9216,2048,cktile,11,0,211.9611,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1373.99,834.48,0.0 +gfx950,256,7753,9216,2048,cktile,27,0,211.5014,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1383.75,839.97,0.0 +gfx950,256,7819,9216,2048,cktile,29,0,211.8379,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1393.32,845.02,0.0 +gfx950,256,8099,9216,2048,cktile,28,0,214.8028,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1423.29,860.05,0.0 +gfx950,256,8129,9216,2048,cktile,29,0,214.6299,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1429.71,863.61,0.0 +gfx950,256,8188,9216,2048,cktile,11,0,213.3814,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1448.52,874.32,0.0 +gfx950,256,8192,9216,2048,cktile,29,0,214.5798,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1441.13,869.82,0.0 +gfx950,256,1,12288,2048,ck,8,0,9.1068,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.53,2766.33,0.0 +gfx950,256,2,12288,2048,ck,8,0,9.0708,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,11.1,2780.25,0.0 
+gfx950,256,4,12288,2048,ck,8,0,9.0952,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.14,2778.64,0.0 +gfx950,256,8,12288,2048,ck,8,0,9.1177,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,44.16,2783.47,0.0 +gfx950,256,16,12288,2048,ck,8,0,9.0887,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,88.61,2815.78,0.0 +gfx950,256,32,12288,2048,ck,7,0,10.928,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,147.38,2380.84,0.0 +gfx950,256,48,12288,2048,ck,18,0,11.9568,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,202.05,2211.61,0.0 +gfx950,256,64,12288,2048,ck,18,0,10.7205,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,300.47,2506.39,0.0 +gfx950,256,80,12288,2048,ck,2,0,15.6057,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,258.02,1749.09,0.0 +gfx950,256,96,12288,2048,ck,2,0,16.4739,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,293.3,1682.77,0.0 +gfx950,256,112,12288,2048,ck,18,0,17.0832,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,329.98,1647.68,0.0 +gfx950,256,128,12288,2048,ck,2,0,16.9352,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,380.42,1687.24,0.0 +gfx950,256,256,12288,2048,ck,0,0,19.9282,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,646.57,1604.84,0.0 +gfx950,256,512,12288,2048,ck,2,0,32.0846,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,803.18,1209.22,0.0 
+gfx950,256,667,12288,2048,cktile,30,0,39.1959,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,856.5,1095.12,0.0 +gfx950,256,671,12288,2048,cktile,30,0,39.3043,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,859.26,1094.8,0.0 +gfx950,256,931,12288,2048,cktile,30,0,40.8482,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2_aqrm,1147.14,1222.89,0.0 +gfx950,256,1024,12288,2048,ck,0,0,50.111,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1028.51,1046.25,0.0 +gfx950,256,1025,12288,2048,ck,0,0,58.9775,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,874.74,889.41,0.0 +gfx950,256,1027,12288,2048,ck,0,0,59.1855,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,873.37,887.19,0.0 +gfx950,256,1031,12288,2048,ck,0,0,59.0863,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,878.24,890.48,0.0 +gfx950,256,1039,12288,2048,ck,0,0,59.3603,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,880.97,889.96,0.0 +gfx950,256,1040,12288,2048,ck,0,0,59.5499,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,879.01,887.57,0.0 +gfx950,256,1041,12288,2048,ck,0,0,58.8407,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,890.46,898.72,0.0 +gfx950,256,1072,12288,2048,ck,0,0,59.7447,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,903.1,898.94,0.0 +gfx950,256,1073,12288,2048,ck,0,0,59.8571,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,902.25,897.69,0.0 
+gfx950,256,1077,12288,2048,ck,0,0,59.3015,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,914.09,907.9,0.0 +gfx950,256,1091,12288,2048,ck,0,0,59.7151,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,919.56,907.85,0.0 +gfx950,256,1343,12288,2048,ck,0,0,72.2876,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,935.09,842.77,0.0 +gfx950,256,1688,12288,2048,cktile,11,0,77.0801,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1102.23,909.54,0.0 +gfx950,256,2017,12288,2048,ck,0,0,91.2692,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1112.3,864.11,0.0 +gfx950,256,2048,12288,2048,ck,0,0,90.6604,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1136.98,879.01,0.0 +gfx950,256,2050,12288,2048,cktile,29,0,97.9354,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1053.55,814.26,0.0 +gfx950,256,2063,12288,2048,cktile,11,0,98.3354,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1055.92,814.47,0.0 +gfx950,256,2064,12288,2048,cktile,27,0,97.7764,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1062.47,819.4,0.0 +gfx950,256,2099,12288,2048,cktile,29,0,98.2888,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1074.85,824.61,0.0 +gfx950,256,2159,12288,2048,ck,0,0,99.6122,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1090.89,829.69,0.0 +gfx950,256,2160,12288,2048,ck,0,0,99.9066,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1088.18,827.51,0.0 +gfx950,256,2313,12288,2048,cktile,11,0,104.9035,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1109.75,826.92,0.0 
+gfx950,256,2325,12288,2048,cktile,11,0,104.9753,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1114.75,829.4,0.0 +gfx950,256,3072,12288,2048,cktile,27,0,118.4377,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1305.49,903.05,0.0 +gfx950,256,3073,12288,2048,cktile,28,0,132.4202,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1168.02,807.89,0.0 +gfx950,256,3182,12288,2048,cktile,27,0,131.7296,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1215.79,834.16,0.0 +gfx950,256,3183,12288,2048,cktile,11,0,132.6358,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1207.86,828.66,0.0 +gfx950,256,3847,12288,2048,cktile,28,0,146.4536,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1322.1,871.19,0.0 +gfx950,256,4096,12288,2048,cktile,29,0,162.4369,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1269.16,826.28,0.0 +gfx950,256,4111,12288,2048,cktile,29,0,162.1653,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1275.94,830.12,0.0 +gfx950,256,4141,12288,2048,cktile,27,0,161.4237,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1291.16,838.88,0.0 +gfx950,256,4142,12288,2048,cktile,11,0,161.4537,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1291.23,838.89,0.0 +gfx950,256,4345,12288,2048,cktile,11,0,169.2982,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1291.75,831.95,0.0 +gfx950,256,5105,12288,2048,cktile,27,0,189.4564,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1356.21,850.23,0.0 +gfx950,256,5121,12288,2048,cktile,11,0,191.5732,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1345.43,843.06,0.0 +gfx950,256,5123,12288,2048,cktile,27,0,189.8892,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1357.89,850.81,0.0 
+gfx950,256,5131,12288,2048,cktile,11,0,190.8192,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1353.38,847.78,0.0 +gfx950,256,5427,12288,2048,cktile,11,0,202.8021,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1346.88,836.55,0.0 +gfx950,256,6113,12288,2048,cktile,11,0,212.1395,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1450.35,885.82,0.0 +gfx950,256,6197,12288,2048,cktile,27,0,226.2389,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1378.65,840.5,0.0 +gfx950,256,6409,12288,2048,cktile,28,0,233.4953,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1381.51,838.56,0.0 +gfx950,256,7121,12288,2048,cktile,11,0,253.8566,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1411.87,845.97,0.0 +gfx950,256,7168,12288,2048,cktile,28,0,253.9465,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1420.68,850.6,0.0 +gfx950,256,7169,12288,2048,cktile,28,0,252.2434,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1430.47,856.45,0.0 +gfx950,256,7172,12288,2048,cktile,28,0,254.1843,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1420.15,850.22,0.0 +gfx950,256,7176,12288,2048,cktile,11,0,253.372,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1425.49,853.37,0.0 +gfx950,256,7177,12288,2048,cktile,29,0,253.8972,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1422.74,851.71,0.0 +gfx950,256,7178,12288,2048,cktile,28,0,252.5689,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1430.42,856.29,0.0 +gfx950,256,7179,12288,2048,cktile,29,0,252.1941,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1432.75,857.67,0.0 +gfx950,256,7183,12288,2048,cktile,28,0,252.3101,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1432.89,857.7,0.0 
+gfx950,256,7184,12288,2048,cktile,11,0,253.0413,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1428.95,855.33,0.0 +gfx950,256,7209,12288,2048,cktile,29,0,253.213,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1432.95,857.37,0.0 +gfx950,256,7210,12288,2048,cktile,29,0,252.0357,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_4,1439.84,861.48,0.0 +gfx950,256,7211,12288,2048,cktile,11,0,252.141,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1439.44,861.23,0.0 +gfx950,256,7217,12288,2048,cktile,27,0,253.1175,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1435.08,858.54,0.0 +gfx950,256,7273,12288,2048,cktile,11,0,252.4367,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1450.11,866.76,0.0 +gfx950,256,7274,12288,2048,cktile,28,0,252.8247,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1448.09,865.54,0.0 +gfx950,256,7391,12288,2048,cktile,28,0,260.2428,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1429.44,852.83,0.0 +gfx950,256,7393,12288,2048,cktile,28,0,261.2632,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1424.24,849.71,0.0 +gfx950,256,7715,12288,2048,cktile,27,0,270.2448,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_2,1436.88,853.19,0.0 +gfx950,256,7753,12288,2048,cktile,11,0,273.9581,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1424.38,845.32,0.0 +gfx950,256,7819,12288,2048,cktile,28,0,273.5002,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1438.91,853.16,0.0 +gfx950,256,8099,12288,2048,cktile,11,0,282.4879,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1443.02,852.4,0.0 +gfx950,256,8129,12288,2048,cktile,11,0,281.2092,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1454.95,859.12,0.0 
+gfx950,256,8188,12288,2048,cktile,11,0,280.3084,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1470.22,867.48,0.0 +gfx950,256,8192,12288,2048,cktile,28,0,281.7916,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_3,1463.2,863.3,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_qwen3_next_80b_a3b.csv b/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_qwen3_next_80b_a3b.csv new file mode 100644 index 0000000000..24aa0c72b9 --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_qwen3_next_80b_a3b.csv @@ -0,0 +1,1483 @@ +M,N,K +1,256,2048 +1,1024,2048 +1,2048,128 +1,2048,512 +1,2048,1024 +1,2048,2048 +1,2048,4096 +1,2560,2048 +1,3072,2048 +1,4608,2048 +1,6144,2048 +1,9216,2048 +1,12288,2048 +2,256,2048 +2,1024,2048 +2,2048,128 +2,2048,512 +2,2048,1024 +2,2048,2048 +2,2048,4096 +2,2560,2048 +2,3072,2048 +2,4608,2048 +2,6144,2048 +2,9216,2048 +2,12288,2048 +4,256,2048 +4,1024,2048 +4,2048,128 +4,2048,512 +4,2048,1024 +4,2048,2048 +4,2048,4096 +4,2560,2048 +4,3072,2048 +4,4608,2048 +4,6144,2048 +4,9216,2048 +4,12288,2048 +8,256,2048 +8,1024,2048 +8,2048,128 +8,2048,512 +8,2048,1024 +8,2048,2048 +8,2048,4096 +8,2560,2048 +8,3072,2048 +8,4608,2048 +8,6144,2048 +8,9216,2048 +8,12288,2048 +16,256,2048 +16,1024,2048 +16,2048,128 +16,2048,512 +16,2048,1024 +16,2048,2048 +16,2048,4096 +16,2560,2048 +16,3072,2048 +16,4608,2048 +16,6144,2048 +16,9216,2048 +16,12288,2048 +24,2048,2048 +24,4608,2048 +24,6144,2048 +32,256,2048 +32,1024,2048 +32,2048,128 +32,2048,512 +32,2048,1024 +32,2048,2048 +32,2048,4096 +32,2560,2048 +32,3072,2048 +32,4608,2048 +32,6144,2048 +32,9216,2048 +32,12288,2048 +40,2048,2048 +40,4608,2048 +40,6144,2048 +48,256,2048 +48,1024,2048 +48,2048,128 +48,2048,512 +48,2048,1024 +48,2048,2048 +48,2048,4096 +48,2560,2048 +48,3072,2048 +48,4608,2048 +48,6144,2048 +48,9216,2048 +48,12288,2048 +56,2048,2048 +56,4608,2048 +56,6144,2048 +64,256,2048 +64,1024,2048 +64,2048,128 
+64,2048,512 +64,2048,1024 +64,2048,2048 +64,2048,4096 +64,2560,2048 +64,3072,2048 +64,4608,2048 +64,6144,2048 +64,9216,2048 +64,12288,2048 +72,2048,2048 +72,4608,2048 +72,6144,2048 +80,256,2048 +80,1024,2048 +80,2048,128 +80,2048,512 +80,2048,1024 +80,2048,2048 +80,2048,4096 +80,2560,2048 +80,3072,2048 +80,4608,2048 +80,6144,2048 +80,9216,2048 +80,12288,2048 +88,2048,2048 +88,4608,2048 +88,6144,2048 +96,256,2048 +96,1024,2048 +96,2048,128 +96,2048,512 +96,2048,1024 +96,2048,2048 +96,2048,4096 +96,2560,2048 +96,3072,2048 +96,4608,2048 +96,6144,2048 +96,9216,2048 +96,12288,2048 +104,2048,2048 +104,4608,2048 +104,6144,2048 +112,256,2048 +112,1024,2048 +112,2048,128 +112,2048,512 +112,2048,1024 +112,2048,2048 +112,2048,4096 +112,2560,2048 +112,3072,2048 +112,4608,2048 +112,6144,2048 +112,9216,2048 +112,12288,2048 +120,2048,2048 +120,4608,2048 +120,6144,2048 +128,256,2048 +128,1024,2048 +128,2048,128 +128,2048,512 +128,2048,1024 +128,2048,2048 +128,2048,4096 +128,2560,2048 +128,3072,2048 +128,4608,2048 +128,6144,2048 +128,9216,2048 +128,12288,2048 +136,2048,2048 +136,4608,2048 +136,6144,2048 +144,2048,2048 +144,4608,2048 +144,6144,2048 +152,2048,2048 +152,4608,2048 +152,6144,2048 +160,2048,2048 +160,4608,2048 +160,6144,2048 +168,2048,2048 +168,4608,2048 +168,6144,2048 +176,2048,2048 +176,4608,2048 +176,6144,2048 +184,2048,2048 +184,4608,2048 +184,6144,2048 +192,2048,2048 +192,4608,2048 +192,6144,2048 +200,2048,2048 +200,4608,2048 +200,6144,2048 +208,2048,2048 +208,4608,2048 +208,6144,2048 +216,2048,2048 +216,4608,2048 +216,6144,2048 +224,2048,2048 +224,4608,2048 +224,6144,2048 +232,2048,2048 +232,4608,2048 +232,6144,2048 +240,2048,2048 +240,4608,2048 +240,6144,2048 +248,2048,2048 +248,4608,2048 +248,6144,2048 +256,256,2048 +256,1024,2048 +256,2048,128 +256,2048,512 +256,2048,1024 +256,2048,2048 +256,2048,4096 +256,2560,2048 +256,3072,2048 +256,4608,2048 +256,6144,2048 +256,9216,2048 +256,12288,2048 +272,2048,2048 +272,4608,2048 +272,6144,2048 +288,2048,2048 
+288,4608,2048 +288,6144,2048 +304,2048,2048 +304,4608,2048 +304,6144,2048 +320,2048,2048 +320,4608,2048 +320,6144,2048 +336,2048,2048 +336,4608,2048 +336,6144,2048 +352,2048,2048 +352,4608,2048 +352,6144,2048 +368,2048,2048 +368,4608,2048 +368,6144,2048 +384,2048,2048 +384,4608,2048 +384,6144,2048 +400,2048,2048 +400,4608,2048 +400,6144,2048 +416,2048,2048 +416,4608,2048 +416,6144,2048 +432,2048,2048 +432,4608,2048 +432,6144,2048 +448,2048,2048 +448,4608,2048 +448,6144,2048 +464,2048,2048 +464,4608,2048 +464,6144,2048 +480,2048,2048 +480,4608,2048 +480,6144,2048 +496,2048,2048 +496,4608,2048 +496,6144,2048 +512,256,2048 +512,1024,2048 +512,2048,128 +512,2048,512 +512,2048,1024 +512,2048,2048 +512,2048,4096 +512,2560,2048 +512,3072,2048 +512,4608,2048 +512,6144,2048 +512,9216,2048 +512,12288,2048 +588,256,2048 +588,2048,128 +588,2048,1024 +588,2560,2048 +588,3072,2048 +664,2048,2048 +664,4608,2048 +664,6144,2048 +667,1024,2048 +667,2048,512 +667,2048,4096 +667,9216,2048 +667,12288,2048 +671,1024,2048 +671,2048,512 +671,2048,4096 +671,9216,2048 +671,12288,2048 +762,2048,2048 +762,4608,2048 +762,6144,2048 +773,256,2048 +773,2048,128 +773,2048,1024 +773,2560,2048 +773,3072,2048 +822,256,2048 +822,2048,128 +822,2048,1024 +822,2560,2048 +822,3072,2048 +902,2048,2048 +902,4608,2048 +902,6144,2048 +931,1024,2048 +931,2048,512 +931,2048,4096 +931,9216,2048 +931,12288,2048 +961,2048,2048 +961,4608,2048 +961,6144,2048 +1000,2048,2048 +1000,4608,2048 +1000,6144,2048 +1001,2048,2048 +1001,4608,2048 +1001,6144,2048 +1002,2048,2048 +1002,4608,2048 +1002,6144,2048 +1003,2048,2048 +1003,4608,2048 +1003,6144,2048 +1015,2048,2048 +1015,4608,2048 +1015,6144,2048 +1023,2048,2048 +1023,4608,2048 +1023,6144,2048 +1024,256,2048 +1024,1024,2048 +1024,2048,128 +1024,2048,512 +1024,2048,1024 +1024,2048,2048 +1024,2048,4096 +1024,2560,2048 +1024,3072,2048 +1024,4608,2048 +1024,6144,2048 +1024,9216,2048 +1024,12288,2048 +1025,256,2048 +1025,1024,2048 +1025,2048,128 +1025,2048,512 
+1025,2048,1024 +1025,2048,2048 +1025,2048,4096 +1025,2560,2048 +1025,3072,2048 +1025,4608,2048 +1025,6144,2048 +1025,9216,2048 +1025,12288,2048 +1027,256,2048 +1027,1024,2048 +1027,2048,128 +1027,2048,512 +1027,2048,1024 +1027,2048,4096 +1027,2560,2048 +1027,3072,2048 +1027,9216,2048 +1027,12288,2048 +1031,1024,2048 +1031,2048,512 +1031,2048,4096 +1031,9216,2048 +1031,12288,2048 +1032,2048,2048 +1032,4608,2048 +1032,6144,2048 +1038,2048,2048 +1038,4608,2048 +1038,6144,2048 +1039,1024,2048 +1039,2048,512 +1039,2048,4096 +1039,9216,2048 +1039,12288,2048 +1040,1024,2048 +1040,2048,512 +1040,2048,4096 +1040,9216,2048 +1040,12288,2048 +1041,1024,2048 +1041,2048,512 +1041,2048,4096 +1041,9216,2048 +1041,12288,2048 +1042,256,2048 +1042,2048,128 +1042,2048,1024 +1042,2560,2048 +1042,3072,2048 +1051,256,2048 +1051,2048,128 +1051,2048,1024 +1051,2560,2048 +1051,3072,2048 +1055,256,2048 +1055,2048,128 +1055,2048,1024 +1055,2560,2048 +1055,3072,2048 +1057,256,2048 +1057,2048,128 +1057,2048,1024 +1057,2560,2048 +1057,3072,2048 +1069,256,2048 +1069,2048,128 +1069,2048,1024 +1069,2560,2048 +1069,3072,2048 +1072,256,2048 +1072,1024,2048 +1072,2048,128 +1072,2048,512 +1072,2048,1024 +1072,2048,4096 +1072,2560,2048 +1072,3072,2048 +1072,9216,2048 +1072,12288,2048 +1073,1024,2048 +1073,2048,512 +1073,2048,4096 +1073,9216,2048 +1073,12288,2048 +1074,256,2048 +1074,2048,128 +1074,2048,1024 +1074,2560,2048 +1074,3072,2048 +1077,1024,2048 +1077,2048,512 +1077,2048,4096 +1077,9216,2048 +1077,12288,2048 +1087,2048,2048 +1087,4608,2048 +1087,6144,2048 +1091,256,2048 +1091,1024,2048 +1091,2048,128 +1091,2048,512 +1091,2048,1024 +1091,2048,4096 +1091,2560,2048 +1091,3072,2048 +1091,9216,2048 +1091,12288,2048 +1128,256,2048 +1128,2048,128 +1128,2048,1024 +1128,2560,2048 +1128,3072,2048 +1136,256,2048 +1136,2048,128 +1136,2048,1024 +1136,2560,2048 +1136,3072,2048 +1255,2048,2048 +1255,4608,2048 +1255,6144,2048 +1343,1024,2048 +1343,2048,512 +1343,2048,4096 +1343,9216,2048 +1343,12288,2048 
+1351,2048,2048 +1351,4608,2048 +1351,6144,2048 +1376,2048,2048 +1376,4608,2048 +1376,6144,2048 +1380,2048,2048 +1380,4608,2048 +1380,6144,2048 +1391,2048,2048 +1391,4608,2048 +1391,6144,2048 +1514,256,2048 +1514,2048,128 +1514,2048,1024 +1514,2560,2048 +1514,3072,2048 +1688,1024,2048 +1688,2048,512 +1688,2048,4096 +1688,9216,2048 +1688,12288,2048 +1710,2048,2048 +1710,4608,2048 +1710,6144,2048 +1808,2048,2048 +1808,4608,2048 +1808,6144,2048 +1809,2048,2048 +1809,4608,2048 +1809,6144,2048 +1870,2048,2048 +1870,4608,2048 +1870,6144,2048 +1967,2048,2048 +1967,4608,2048 +1967,6144,2048 +1970,2048,2048 +1970,4608,2048 +1970,6144,2048 +2000,2048,2048 +2000,4608,2048 +2000,6144,2048 +2001,2048,2048 +2001,4608,2048 +2001,6144,2048 +2016,2048,2048 +2016,4608,2048 +2016,6144,2048 +2017,256,2048 +2017,1024,2048 +2017,2048,128 +2017,2048,512 +2017,2048,1024 +2017,2048,2048 +2017,2048,4096 +2017,2560,2048 +2017,3072,2048 +2017,4608,2048 +2017,6144,2048 +2017,9216,2048 +2017,12288,2048 +2046,2048,2048 +2046,4608,2048 +2046,6144,2048 +2048,256,2048 +2048,1024,2048 +2048,2048,128 +2048,2048,512 +2048,2048,1024 +2048,2048,2048 +2048,2048,4096 +2048,2560,2048 +2048,3072,2048 +2048,4608,2048 +2048,6144,2048 +2048,9216,2048 +2048,12288,2048 +2050,1024,2048 +2050,2048,512 +2050,2048,2048 +2050,2048,4096 +2050,4608,2048 +2050,6144,2048 +2050,9216,2048 +2050,12288,2048 +2063,1024,2048 +2063,2048,512 +2063,2048,4096 +2063,9216,2048 +2063,12288,2048 +2064,1024,2048 +2064,2048,512 +2064,2048,4096 +2064,9216,2048 +2064,12288,2048 +2081,2048,2048 +2081,4608,2048 +2081,6144,2048 +2095,2048,2048 +2095,4608,2048 +2095,6144,2048 +2099,1024,2048 +2099,2048,512 +2099,2048,4096 +2099,9216,2048 +2099,12288,2048 +2151,256,2048 +2151,2048,128 +2151,2048,1024 +2151,2560,2048 +2151,3072,2048 +2154,2048,2048 +2154,4608,2048 +2154,6144,2048 +2159,1024,2048 +2159,2048,512 +2159,2048,2048 +2159,2048,4096 +2159,4608,2048 +2159,6144,2048 +2159,9216,2048 +2159,12288,2048 +2160,1024,2048 +2160,2048,512 
+2160,2048,4096 +2160,9216,2048 +2160,12288,2048 +2162,2048,2048 +2162,4608,2048 +2162,6144,2048 +2165,2048,2048 +2165,4608,2048 +2165,6144,2048 +2251,2048,2048 +2251,4608,2048 +2251,6144,2048 +2252,2048,2048 +2252,4608,2048 +2252,6144,2048 +2255,2048,2048 +2255,4608,2048 +2255,6144,2048 +2313,1024,2048 +2313,2048,512 +2313,2048,4096 +2313,9216,2048 +2313,12288,2048 +2317,2048,2048 +2317,4608,2048 +2317,6144,2048 +2325,1024,2048 +2325,2048,512 +2325,2048,4096 +2325,9216,2048 +2325,12288,2048 +2332,256,2048 +2332,2048,128 +2332,2048,1024 +2332,2560,2048 +2332,3072,2048 +2341,256,2048 +2341,2048,128 +2341,2048,1024 +2341,2560,2048 +2341,3072,2048 +2694,2048,2048 +2694,4608,2048 +2694,6144,2048 +2715,2048,2048 +2715,4608,2048 +2715,6144,2048 +2819,2048,2048 +2819,4608,2048 +2819,6144,2048 +2853,2048,2048 +2853,4608,2048 +2853,6144,2048 +2914,2048,2048 +2914,4608,2048 +2914,6144,2048 +2974,2048,2048 +2974,4608,2048 +2974,6144,2048 +3000,2048,2048 +3000,4608,2048 +3000,6144,2048 +3001,2048,2048 +3001,4608,2048 +3001,6144,2048 +3072,256,2048 +3072,1024,2048 +3072,2048,128 +3072,2048,512 +3072,2048,1024 +3072,2048,2048 +3072,2048,4096 +3072,2560,2048 +3072,3072,2048 +3072,4608,2048 +3072,6144,2048 +3072,9216,2048 +3072,12288,2048 +3073,256,2048 +3073,1024,2048 +3073,2048,128 +3073,2048,512 +3073,2048,1024 +3073,2048,2048 +3073,2048,4096 +3073,2560,2048 +3073,3072,2048 +3073,4608,2048 +3073,6144,2048 +3073,9216,2048 +3073,12288,2048 +3077,2048,2048 +3077,4608,2048 +3077,6144,2048 +3078,2048,2048 +3078,4608,2048 +3078,6144,2048 +3086,2048,2048 +3086,4608,2048 +3086,6144,2048 +3089,256,2048 +3089,2048,128 +3089,2048,1024 +3089,2048,2048 +3089,2560,2048 +3089,3072,2048 +3089,4608,2048 +3089,6144,2048 +3110,256,2048 +3110,2048,128 +3110,2048,1024 +3110,2560,2048 +3110,3072,2048 +3118,2048,2048 +3118,4608,2048 +3118,6144,2048 +3137,256,2048 +3137,2048,128 +3137,2048,1024 +3137,2560,2048 +3137,3072,2048 +3138,2048,2048 +3138,4608,2048 +3138,6144,2048 +3154,2048,2048 
+3154,4608,2048 +3154,6144,2048 +3182,1024,2048 +3182,2048,512 +3182,2048,4096 +3182,9216,2048 +3182,12288,2048 +3183,1024,2048 +3183,2048,512 +3183,2048,4096 +3183,9216,2048 +3183,12288,2048 +3289,2048,2048 +3289,4608,2048 +3289,6144,2048 +3437,2048,2048 +3437,4608,2048 +3437,6144,2048 +3613,256,2048 +3613,2048,128 +3613,2048,1024 +3613,2560,2048 +3613,3072,2048 +3617,2048,2048 +3617,4608,2048 +3617,6144,2048 +3620,2048,2048 +3620,4608,2048 +3620,6144,2048 +3669,2048,2048 +3669,4608,2048 +3669,6144,2048 +3824,2048,2048 +3824,4608,2048 +3824,6144,2048 +3847,1024,2048 +3847,2048,512 +3847,2048,4096 +3847,9216,2048 +3847,12288,2048 +3858,2048,2048 +3858,4608,2048 +3858,6144,2048 +3906,2048,2048 +3906,4608,2048 +3906,6144,2048 +4000,2048,2048 +4000,4608,2048 +4000,6144,2048 +4004,2048,2048 +4004,4608,2048 +4004,6144,2048 +4096,256,2048 +4096,1024,2048 +4096,2048,128 +4096,2048,512 +4096,2048,1024 +4096,2048,4096 +4096,2560,2048 +4096,3072,2048 +4096,9216,2048 +4096,12288,2048 +4097,256,2048 +4097,2048,128 +4097,2048,1024 +4097,2560,2048 +4097,3072,2048 +4100,2048,2048 +4100,4608,2048 +4100,6144,2048 +4108,2048,2048 +4108,4608,2048 +4108,6144,2048 +4109,256,2048 +4109,2048,128 +4109,2048,1024 +4109,2560,2048 +4109,3072,2048 +4111,1024,2048 +4111,2048,512 +4111,2048,4096 +4111,9216,2048 +4111,12288,2048 +4121,2048,2048 +4121,4608,2048 +4121,6144,2048 +4123,2048,2048 +4123,4608,2048 +4123,6144,2048 +4141,1024,2048 +4141,2048,512 +4141,2048,4096 +4141,9216,2048 +4141,12288,2048 +4142,1024,2048 +4142,2048,512 +4142,2048,4096 +4142,9216,2048 +4142,12288,2048 +4143,256,2048 +4143,2048,128 +4143,2048,1024 +4143,2560,2048 +4143,3072,2048 +4149,2048,2048 +4149,4608,2048 +4149,6144,2048 +4204,2048,2048 +4204,4608,2048 +4204,6144,2048 +4223,2048,2048 +4223,4608,2048 +4223,6144,2048 +4229,2048,2048 +4229,4608,2048 +4229,6144,2048 +4345,256,2048 +4345,1024,2048 +4345,2048,128 +4345,2048,512 +4345,2048,1024 +4345,2048,4096 +4345,2560,2048 +4345,3072,2048 +4345,9216,2048 
+4345,12288,2048 +4379,2048,2048 +4379,4608,2048 +4379,6144,2048 +4410,2048,2048 +4410,4608,2048 +4410,6144,2048 +4413,256,2048 +4413,2048,128 +4413,2048,1024 +4413,2560,2048 +4413,3072,2048 +4498,2048,2048 +4498,4608,2048 +4498,6144,2048 +4585,2048,2048 +4585,4608,2048 +4585,6144,2048 +4586,2048,2048 +4586,4608,2048 +4586,6144,2048 +4723,2048,2048 +4723,4608,2048 +4723,6144,2048 +4743,2048,2048 +4743,4608,2048 +4743,6144,2048 +4822,2048,2048 +4822,4608,2048 +4822,6144,2048 +4898,2048,2048 +4898,4608,2048 +4898,6144,2048 +4936,2048,2048 +4936,4608,2048 +4936,6144,2048 +4992,2048,2048 +4992,4608,2048 +4992,6144,2048 +5000,2048,2048 +5000,4608,2048 +5000,6144,2048 +5001,2048,2048 +5001,4608,2048 +5001,6144,2048 +5002,2048,2048 +5002,4608,2048 +5002,6144,2048 +5003,2048,2048 +5003,4608,2048 +5003,6144,2048 +5004,2048,2048 +5004,4608,2048 +5004,6144,2048 +5005,2048,2048 +5005,4608,2048 +5005,6144,2048 +5006,2048,2048 +5006,4608,2048 +5006,6144,2048 +5010,2048,2048 +5010,4608,2048 +5010,6144,2048 +5011,2048,2048 +5011,4608,2048 +5011,6144,2048 +5012,2048,2048 +5012,4608,2048 +5012,6144,2048 +5026,2048,2048 +5026,4608,2048 +5026,6144,2048 +5028,2048,2048 +5028,4608,2048 +5028,6144,2048 +5030,2048,2048 +5030,4608,2048 +5030,6144,2048 +5031,2048,2048 +5031,4608,2048 +5031,6144,2048 +5060,2048,2048 +5060,4608,2048 +5060,6144,2048 +5062,2048,2048 +5062,4608,2048 +5062,6144,2048 +5063,2048,2048 +5063,4608,2048 +5063,6144,2048 +5091,2048,2048 +5091,4608,2048 +5091,6144,2048 +5105,1024,2048 +5105,2048,512 +5105,2048,2048 +5105,2048,4096 +5105,4608,2048 +5105,6144,2048 +5105,9216,2048 +5105,12288,2048 +5120,2048,2048 +5120,4608,2048 +5120,6144,2048 +5121,256,2048 +5121,1024,2048 +5121,2048,128 +5121,2048,512 +5121,2048,1024 +5121,2048,4096 +5121,2560,2048 +5121,3072,2048 +5121,9216,2048 +5121,12288,2048 +5122,2048,2048 +5122,4608,2048 +5122,6144,2048 +5123,256,2048 +5123,1024,2048 +5123,2048,128 +5123,2048,512 +5123,2048,1024 +5123,2048,2048 +5123,2048,4096 +5123,2560,2048 
+5123,3072,2048 +5123,4608,2048 +5123,6144,2048 +5123,9216,2048 +5123,12288,2048 +5124,2048,2048 +5124,4608,2048 +5124,6144,2048 +5127,2048,2048 +5127,4608,2048 +5127,6144,2048 +5131,1024,2048 +5131,2048,512 +5131,2048,4096 +5131,9216,2048 +5131,12288,2048 +5133,256,2048 +5133,2048,128 +5133,2048,1024 +5133,2560,2048 +5133,3072,2048 +5177,2048,2048 +5177,4608,2048 +5177,6144,2048 +5183,2048,2048 +5183,4608,2048 +5183,6144,2048 +5185,256,2048 +5185,2048,128 +5185,2048,1024 +5185,2560,2048 +5185,3072,2048 +5191,256,2048 +5191,2048,128 +5191,2048,1024 +5191,2560,2048 +5191,3072,2048 +5252,2048,2048 +5252,4608,2048 +5252,6144,2048 +5255,2048,2048 +5255,4608,2048 +5255,6144,2048 +5361,2048,2048 +5361,4608,2048 +5361,6144,2048 +5427,1024,2048 +5427,2048,512 +5427,2048,2048 +5427,2048,4096 +5427,4608,2048 +5427,6144,2048 +5427,9216,2048 +5427,12288,2048 +5471,2048,2048 +5471,4608,2048 +5471,6144,2048 +5534,2048,2048 +5534,4608,2048 +5534,6144,2048 +5564,2048,2048 +5564,4608,2048 +5564,6144,2048 +5671,2048,2048 +5671,4608,2048 +5671,6144,2048 +5927,2048,2048 +5927,4608,2048 +5927,6144,2048 +5954,2048,2048 +5954,4608,2048 +5954,6144,2048 +6000,2048,2048 +6000,4608,2048 +6000,6144,2048 +6002,2048,2048 +6002,4608,2048 +6002,6144,2048 +6113,1024,2048 +6113,2048,512 +6113,2048,4096 +6113,9216,2048 +6113,12288,2048 +6146,256,2048 +6146,2048,128 +6146,2048,1024 +6146,2048,2048 +6146,2560,2048 +6146,3072,2048 +6146,4608,2048 +6146,6144,2048 +6147,256,2048 +6147,2048,128 +6147,2048,1024 +6147,2560,2048 +6147,3072,2048 +6155,256,2048 +6155,2048,128 +6155,2048,1024 +6155,2560,2048 +6155,3072,2048 +6194,2048,2048 +6194,4608,2048 +6194,6144,2048 +6197,1024,2048 +6197,2048,512 +6197,2048,4096 +6197,9216,2048 +6197,12288,2048 +6227,2048,2048 +6227,4608,2048 +6227,6144,2048 +6229,2048,2048 +6229,4608,2048 +6229,6144,2048 +6234,2048,2048 +6234,4608,2048 +6234,6144,2048 +6246,2048,2048 +6246,4608,2048 +6246,6144,2048 +6304,2048,2048 +6304,4608,2048 +6304,6144,2048 +6314,2048,2048 
+6314,4608,2048 +6314,6144,2048 +6318,2048,2048 +6318,4608,2048 +6318,6144,2048 +6341,2048,2048 +6341,4608,2048 +6341,6144,2048 +6377,256,2048 +6377,2048,128 +6377,2048,1024 +6377,2048,2048 +6377,2560,2048 +6377,3072,2048 +6377,4608,2048 +6377,6144,2048 +6401,256,2048 +6401,2048,128 +6401,2048,1024 +6401,2560,2048 +6401,3072,2048 +6409,1024,2048 +6409,2048,512 +6409,2048,4096 +6409,9216,2048 +6409,12288,2048 +6476,2048,2048 +6476,4608,2048 +6476,6144,2048 +6483,2048,2048 +6483,4608,2048 +6483,6144,2048 +6492,2048,2048 +6492,4608,2048 +6492,6144,2048 +6519,2048,2048 +6519,4608,2048 +6519,6144,2048 +6609,2048,2048 +6609,4608,2048 +6609,6144,2048 +6811,2048,2048 +6811,4608,2048 +6811,6144,2048 +6813,2048,2048 +6813,4608,2048 +6813,6144,2048 +6818,2048,2048 +6818,4608,2048 +6818,6144,2048 +6830,2048,2048 +6830,4608,2048 +6830,6144,2048 +7001,2048,2048 +7001,4608,2048 +7001,6144,2048 +7025,2048,2048 +7025,4608,2048 +7025,6144,2048 +7121,256,2048 +7121,1024,2048 +7121,2048,128 +7121,2048,512 +7121,2048,1024 +7121,2048,2048 +7121,2048,4096 +7121,2560,2048 +7121,3072,2048 +7121,4608,2048 +7121,6144,2048 +7121,9216,2048 +7121,12288,2048 +7168,1024,2048 +7168,2048,512 +7168,2048,4096 +7168,9216,2048 +7168,12288,2048 +7169,256,2048 +7169,1024,2048 +7169,2048,128 +7169,2048,512 +7169,2048,1024 +7169,2048,2048 +7169,2048,4096 +7169,2560,2048 +7169,3072,2048 +7169,4608,2048 +7169,6144,2048 +7169,9216,2048 +7169,12288,2048 +7170,256,2048 +7170,2048,128 +7170,2048,1024 +7170,2560,2048 +7170,3072,2048 +7172,1024,2048 +7172,2048,512 +7172,2048,4096 +7172,9216,2048 +7172,12288,2048 +7176,256,2048 +7176,1024,2048 +7176,2048,128 +7176,2048,512 +7176,2048,1024 +7176,2048,4096 +7176,2560,2048 +7176,3072,2048 +7176,9216,2048 +7176,12288,2048 +7177,256,2048 +7177,1024,2048 +7177,2048,128 +7177,2048,512 +7177,2048,1024 +7177,2048,2048 +7177,2048,4096 +7177,2560,2048 +7177,3072,2048 +7177,4608,2048 +7177,6144,2048 +7177,9216,2048 +7177,12288,2048 +7178,1024,2048 +7178,2048,512 
+7178,2048,4096 +7178,9216,2048 +7178,12288,2048 +7179,1024,2048 +7179,2048,512 +7179,2048,4096 +7179,9216,2048 +7179,12288,2048 +7183,1024,2048 +7183,2048,512 +7183,2048,4096 +7183,9216,2048 +7183,12288,2048 +7184,1024,2048 +7184,2048,512 +7184,2048,4096 +7184,9216,2048 +7184,12288,2048 +7185,256,2048 +7185,2048,128 +7185,2048,1024 +7185,2560,2048 +7185,3072,2048 +7206,256,2048 +7206,2048,128 +7206,2048,1024 +7206,2560,2048 +7206,3072,2048 +7209,1024,2048 +7209,2048,512 +7209,2048,4096 +7209,9216,2048 +7209,12288,2048 +7210,1024,2048 +7210,2048,512 +7210,2048,4096 +7210,9216,2048 +7210,12288,2048 +7211,1024,2048 +7211,2048,512 +7211,2048,4096 +7211,9216,2048 +7211,12288,2048 +7217,256,2048 +7217,1024,2048 +7217,2048,128 +7217,2048,512 +7217,2048,1024 +7217,2048,2048 +7217,2048,4096 +7217,2560,2048 +7217,3072,2048 +7217,4608,2048 +7217,6144,2048 +7217,9216,2048 +7217,12288,2048 +7223,2048,2048 +7223,4608,2048 +7223,6144,2048 +7238,2048,2048 +7238,4608,2048 +7238,6144,2048 +7246,2048,2048 +7246,4608,2048 +7246,6144,2048 +7249,2048,2048 +7249,4608,2048 +7249,6144,2048 +7257,256,2048 +7257,2048,128 +7257,2048,1024 +7257,2560,2048 +7257,3072,2048 +7265,256,2048 +7265,2048,128 +7265,2048,1024 +7265,2560,2048 +7265,3072,2048 +7273,1024,2048 +7273,2048,512 +7273,2048,4096 +7273,9216,2048 +7273,12288,2048 +7274,1024,2048 +7274,2048,512 +7274,2048,4096 +7274,9216,2048 +7274,12288,2048 +7316,2048,2048 +7316,4608,2048 +7316,6144,2048 +7391,1024,2048 +7391,2048,512 +7391,2048,4096 +7391,9216,2048 +7391,12288,2048 +7393,256,2048 +7393,1024,2048 +7393,2048,128 +7393,2048,512 +7393,2048,1024 +7393,2048,2048 +7393,2048,4096 +7393,2560,2048 +7393,3072,2048 +7393,4608,2048 +7393,6144,2048 +7393,9216,2048 +7393,12288,2048 +7461,256,2048 +7461,2048,128 +7461,2048,1024 +7461,2560,2048 +7461,3072,2048 +7491,2048,2048 +7491,4608,2048 +7491,6144,2048 +7558,2048,2048 +7558,4608,2048 +7558,6144,2048 +7634,256,2048 +7634,2048,128 +7634,2048,1024 +7634,2560,2048 +7634,3072,2048 
+7650,2048,2048 +7650,4608,2048 +7650,6144,2048 +7715,1024,2048 +7715,2048,512 +7715,2048,4096 +7715,9216,2048 +7715,12288,2048 +7753,1024,2048 +7753,2048,512 +7753,2048,2048 +7753,2048,4096 +7753,4608,2048 +7753,6144,2048 +7753,9216,2048 +7753,12288,2048 +7774,2048,2048 +7774,4608,2048 +7774,6144,2048 +7815,2048,2048 +7815,4608,2048 +7815,6144,2048 +7819,1024,2048 +7819,2048,512 +7819,2048,4096 +7819,9216,2048 +7819,12288,2048 +7834,2048,2048 +7834,4608,2048 +7834,6144,2048 +7836,2048,2048 +7836,4608,2048 +7836,6144,2048 +7871,2048,2048 +7871,4608,2048 +7871,6144,2048 +7899,2048,2048 +7899,4608,2048 +7899,6144,2048 +7913,2048,2048 +7913,4608,2048 +7913,6144,2048 +7930,2048,2048 +7930,4608,2048 +7930,6144,2048 +7961,2048,2048 +7961,4608,2048 +7961,6144,2048 +7963,2048,2048 +7963,4608,2048 +7963,6144,2048 +7974,2048,2048 +7974,4608,2048 +7974,6144,2048 +8016,2048,2048 +8016,4608,2048 +8016,6144,2048 +8017,2048,2048 +8017,4608,2048 +8017,6144,2048 +8099,1024,2048 +8099,2048,512 +8099,2048,4096 +8099,9216,2048 +8099,12288,2048 +8103,2048,2048 +8103,4608,2048 +8103,6144,2048 +8128,2048,2048 +8128,4608,2048 +8128,6144,2048 +8129,256,2048 +8129,1024,2048 +8129,2048,128 +8129,2048,512 +8129,2048,1024 +8129,2048,2048 +8129,2048,4096 +8129,2560,2048 +8129,3072,2048 +8129,4608,2048 +8129,6144,2048 +8129,9216,2048 +8129,12288,2048 +8150,2048,2048 +8150,4608,2048 +8150,6144,2048 +8184,2048,2048 +8184,4608,2048 +8184,6144,2048 +8185,2048,2048 +8185,4608,2048 +8185,6144,2048 +8188,1024,2048 +8188,2048,512 +8188,2048,4096 +8188,9216,2048 +8188,12288,2048 +8192,256,2048 +8192,1024,2048 +8192,2048,128 +8192,2048,512 +8192,2048,1024 +8192,2048,2048 +8192,2048,4096 +8192,2560,2048 +8192,3072,2048 +8192,4608,2048 +8192,6144,2048 +8192,9216,2048 +8192,12288,2048 diff --git a/aiter/ops/gemm_op_a8w8.py b/aiter/ops/gemm_op_a8w8.py index 1cea93bfeb..f4b9d9b305 100644 --- a/aiter/ops/gemm_op_a8w8.py +++ b/aiter/ops/gemm_op_a8w8.py @@ -216,6 +216,7 @@ def gemm_a8w8_blockscale_ck( x_scale: 
torch.Tensor, w_scale: torch.Tensor, Out: torch.Tensor, + splitK: int = 0, ) -> torch.Tensor: ... @@ -231,6 +232,7 @@ def gemm_a8w8_blockscale_cktile( w_scale: torch.Tensor, Out: torch.Tensor, isBpreshuffled: bool = False, + splitK: int = 0, ) -> torch.Tensor: ... @@ -685,10 +687,15 @@ def gemm_a8w8_blockscale( ) if config is not None: libtype = config["libtype"] + splitK = int(config.get("splitK", 0)) if libtype == "ck": - return gemm_a8w8_blockscale_ck(XQ, WQ, x_scale, w_scale, Y) + return gemm_a8w8_blockscale_ck( + XQ, WQ, x_scale, w_scale, Y, splitK=splitK + ) elif libtype == "cktile": - return gemm_a8w8_blockscale_cktile(XQ, WQ, x_scale, w_scale, Y) + return gemm_a8w8_blockscale_cktile( + XQ, WQ, x_scale, w_scale, Y, splitK=splitK + ) else: assert 0, f"Unsupported libtype {libtype} for gemm_a8w8_blockscale" try: diff --git a/aiter/ops/triton/_triton_kernels/causal_conv1d_update_single_token.py b/aiter/ops/triton/_triton_kernels/causal_conv1d_update_single_token.py new file mode 100644 index 0000000000..4f2f0356d9 --- /dev/null +++ b/aiter/ops/triton/_triton_kernels/causal_conv1d_update_single_token.py @@ -0,0 +1,507 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2024, Tri Dao. +# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +# +# Kernels for causal_conv1d **update** single-token paths: ``conv_state`` is updated in place. 
+ +import triton +import triton.language as tl + + +@triton.jit() +def _causal_conv1d_update_single_token_kernel( + # Pointers to matrices + x_ptr, # (batch, dim, seqlen) + w_ptr, # (dim, width) + bias_ptr, + conv_state_ptr, + conv_state_indices_ptr, + block_idx_last_scheduled_token, # (batch,) + initial_state_idx, # (batch,) + o_ptr, # (batch, dim, seqlen) + # Matrix dimensions + batch: int, + dim: tl.constexpr, + seqlen: tl.constexpr, + state_len: tl.constexpr, + num_cache_lines: tl.constexpr, # added to support vLLM larger cache lines + # Strides + stride_x_seq: tl.constexpr, + stride_x_dim: tl.constexpr, + stride_x_token: tl.constexpr, + stride_w_dim: tl.constexpr, + stride_w_width: tl.constexpr, + stride_conv_state_seq: tl.constexpr, + stride_conv_state_dim: tl.constexpr, + stride_conv_state_tok: tl.constexpr, + stride_state_indices: tl.constexpr, + stride_o_seq: tl.constexpr, + stride_o_dim: tl.constexpr, + stride_o_token: tl.constexpr, + # others + pad_slot_id: tl.constexpr, + # Meta-parameters + HAS_BIAS: tl.constexpr, + KERNEL_WIDTH: tl.constexpr, + SILU_ACTIVATION: tl.constexpr, + IS_APC_ENABLED: tl.constexpr, + NP2_STATELEN: tl.constexpr, + USE_PAD_SLOT: tl.constexpr, + BLOCK_N: tl.constexpr, +): + # ruff: noqa: E501 + idx_seq = tl.program_id(0) + if idx_seq >= batch: + return + + # [BLOCK_N,] elements along the feature-dimension (channel) + idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + + if IS_APC_ENABLED: + # Get the state from the initial_state_idx + conv_state_init = tl.load(initial_state_idx + idx_seq) + current_last_index = tl.load(block_idx_last_scheduled_token + idx_seq) + else: + conv_state_init = 0 + current_last_index = 0 + + # cache_idx + conv_states_input_coord = tl.load( + conv_state_indices_ptr + idx_seq * stride_state_indices + conv_state_init + ).to(tl.int64) + + if USE_PAD_SLOT: # noqa + if conv_states_input_coord == pad_slot_id: + # not processing as this is not the actual sequence + return + + # IS_VARLEN is False + 
query_start_index = idx_seq * seqlen + query_end_index = query_start_index + seqlen + x_offset = idx_seq * stride_x_seq + o_offset = idx_seq * stride_o_seq + + if query_start_index == query_end_index: + return + + # IS_SPEC_DECODING is False + conv_state_token_offset = 0 + + # STEP 1: READ init_state data + # note: NP2_STATELEN = triton.next_power_of_2(KERNEL_WIDTH - 1) + idx_cols = tl.arange(0, NP2_STATELEN) + conv_state_ptrs_cols = ( + conv_state_ptr + + (conv_states_input_coord * stride_conv_state_seq) + + conv_state_token_offset * stride_conv_state_tok + + (idx_feats * stride_conv_state_dim)[:, None] + + (idx_cols * stride_conv_state_tok)[None, :] + ) # [BLOCK_N, NP2_STATELEN] + mask_cols = ( + (conv_states_input_coord < num_cache_lines) + & (idx_feats < dim)[:, None] + & (idx_cols < KERNEL_WIDTH - 1)[None, :] + ) + cols = tl.load(conv_state_ptrs_cols, mask_cols, other=0.0) + + # STEP 2: assume state_len > seqlen + idx_tokens = tl.arange(0, NP2_STATELEN) # [BLOCK_M] + + # With speculative decoding, the conv_state updates works in a sliding + # window manner, at each forward pass, the tokens are shift by 1, so we + # load since idx_tokens + 1. 
+ conv_state_ptrs_source = ( + conv_state_ptr + + (conv_states_input_coord * stride_conv_state_seq) + + conv_state_token_offset * stride_conv_state_tok + + (idx_feats * stride_conv_state_dim)[None, :] + + ((idx_tokens + seqlen) * stride_conv_state_tok)[:, None] + ) # [BLOCK_M, BLOCK_N] + mask = ( + (conv_states_input_coord < num_cache_lines) + & ((idx_tokens + seqlen) < state_len)[:, None] + & (idx_feats < dim)[None, :] + ) + conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0) + + VAL = state_len - seqlen + x_base = x_ptr + x_offset + (idx_feats * stride_x_dim) # [BLOCK_N] + + x_ptrs = ( + x_base[None, :] + ((idx_tokens - VAL) * stride_x_token)[:, None] + ) # [BLOCK_M, BLOCK_N] + + mask_x = ( + (idx_tokens - VAL >= 0)[:, None] + & (idx_tokens - VAL < seqlen)[:, None] + & (idx_feats < dim)[None, :] + ) # token-index # token-index # feature-index + loaded_x = tl.load(x_ptrs, mask_x, 0.0) + tl.debug_barrier() + + new_conv_state = tl.where(mask, conv_state, loaded_x) + + # Get the state from the initial_state_idx + # cache_idx + conv_states_offset = tl.load( + conv_state_indices_ptr + idx_seq * stride_state_indices + current_last_index + ).to(tl.int64) + conv_state_ptrs_target = ( + conv_state_ptr + + (conv_states_offset * stride_conv_state_seq) # Offset from seq + + (idx_feats * stride_conv_state_dim) + )[ + None, : + ] + ( # [BLOCK_N,] + idx_tokens * stride_conv_state_tok + )[ + :, None + ] + mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :] + tl.store(conv_state_ptrs_target, new_conv_state, mask) + + # STEP 3: init accumulator, not necessary + # if HAS_BIAS: + # bias = bias_ptr + idx_feats + # mask_bias = idx_feats < dim + # acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to( + # tl.float32 + # ) # [BLOCK_N] + # else: + # acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32) + + # STEP 4: + # LOAD WEIGHTS and compute + w_cols_ptrs = ( + w_ptr + + (idx_feats * stride_w_dim)[:, None] + + (idx_cols * stride_w_width)[None, :] + ) + 
@triton.jit()
def _reshape_causal_conv1d_update_single_token_kernel(
    # Pointers to matrices
    x_ptr,  # packed qkvz input, (num_tokens, dim+z_dim, seqlen) where seqlen=1
    ba_ptr,  # packed (b, a) gate parameters, interleaved per k-head
    z_ptr,  # (num_tokens, num_v_heads, head_v_dim) gate output buffer
    core_attn_out_ptr,  # (num_tokens, num_v_heads, head_v_dim), zero-filled here
    b_ptr,  # (num_accepted_tokens, num_v_heads) de-interleaved b output
    a_ptr,  # (num_accepted_tokens, num_v_heads) de-interleaved a output
    w_ptr,  # conv weights, (dim, width)
    bias_ptr,  # optional conv bias, (dim,)
    conv_state_ptr,  # conv state cache, (num_cache_lines, dim, state_len)
    conv_state_indices_ptr,  # maps sequence -> cache line
    block_idx_last_scheduled_token,  # (batch,), used only when IS_APC_ENABLED
    initial_state_idx,  # (batch,), used only when IS_APC_ENABLED
    o_ptr,  # conv output, (num_accepted_tokens, dim, seqlen)
    # Matrix dimensions
    batch: int,
    num_tokens: int,
    num_k_heads: tl.constexpr,
    num_v_heads: tl.constexpr,
    head_k_dim: tl.constexpr,
    head_v_dim: tl.constexpr,
    dim: tl.constexpr,
    head_qkvz_dim: tl.constexpr,
    seqlen: tl.constexpr,
    state_len: tl.constexpr,
    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
    # Strides
    stride_x_seq: tl.constexpr,
    stride_x_dim: tl.constexpr,
    stride_x_token: tl.constexpr,
    stride_w_dim: tl.constexpr,
    stride_w_width: tl.constexpr,
    stride_conv_state_seq: tl.constexpr,
    stride_conv_state_dim: tl.constexpr,
    stride_conv_state_tok: tl.constexpr,
    stride_state_indices: tl.constexpr,
    stride_o_seq: tl.constexpr,
    stride_o_dim: tl.constexpr,
    stride_o_token: tl.constexpr,
    stride_z_seq: tl.constexpr,
    stride_ba_seq: tl.constexpr,
    stride_ba_token: tl.constexpr,
    stride_b_seq: tl.constexpr,
    # others
    pad_slot_id: tl.constexpr,
    num_program_write_z: tl.constexpr,
    BLOCK_Z: tl.constexpr,
    HV: tl.constexpr,
    # Meta-parameters
    HAS_BIAS: tl.constexpr,
    KERNEL_WIDTH: tl.constexpr,
    SILU_ACTIVATION: tl.constexpr,
    IS_APC_ENABLED: tl.constexpr,
    NP2_STATELEN: tl.constexpr,
    USE_PAD_SLOT: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    """Fused de-interleave + single-token causal conv1d state update.

    Grid axis 0 is the sequence index; grid axis 1 partitions the work into
    three roles:

    * program 0           — de-interleave (b, a) from ``ba_ptr`` into ``b_ptr``/``a_ptr``;
    * programs 1..W       — gather z from the packed qkvz layout into ``z_ptr``
                            and zero-fill ``z``/``core_attn_out`` tails up to
                            ``num_tokens`` (W = num_program_write_z);
    * remaining programs  — the regular causal conv1d update over BLOCK_N
                            features, updating ``conv_state`` in place and
                            writing the conv output to ``o_ptr``.
    """
    # ruff: noqa: E501
    idx_seq = tl.program_id(0)
    if idx_seq >= batch:
        return

    ## Role 1: de-interleave b, a (per k-head the layout packs
    ## num_v_heads/num_k_heads b values followed by the same number of a values).
    if tl.program_id(1) == 0:
        ## HV = triton.next_power_of_2(num_v_heads)
        idx_hv = tl.arange(0, HV)
        ## map the flat v-head index to its (k-head, v-within-k) source slot
        idx_h = idx_hv // (num_v_heads // num_k_heads)
        idx_v = idx_hv % (num_v_heads // num_k_heads)
        b_source_offset = idx_h * (2 * num_v_heads // num_k_heads) + idx_v
        a_source_offset = (
            idx_h * (2 * num_v_heads // num_k_heads)
            + num_v_heads // num_k_heads
            + idx_v
        )

        b_source_ptrs = (
            ba_ptr + idx_seq * stride_ba_seq + b_source_offset * stride_ba_token
        )
        a_source_ptrs = (
            ba_ptr + idx_seq * stride_ba_seq + a_source_offset * stride_ba_token
        )
        mask_ba = idx_hv < num_v_heads
        b = tl.load(b_source_ptrs, mask=mask_ba, other=0.0)
        a = tl.load(a_source_ptrs, mask=mask_ba, other=0.0)
        ## b, a should be contiguous so the last stride is 1
        b_ptrs = b_ptr + idx_seq * stride_b_seq + idx_hv
        a_ptrs = a_ptr + idx_seq * stride_b_seq + idx_hv
        tl.store(b_ptrs, b, mask_ba)
        tl.store(a_ptrs, a, mask_ba)
    ## Role 2: gather z out of the packed qkvz layout
    elif tl.program_id(1) < 1 + num_program_write_z:
        idx_z = (tl.program_id(1) - 1) * BLOCK_Z + tl.arange(0, BLOCK_Z)
        ## map idx_z to its source position inside a head_qkvz_dim-sized group
        ## (z sits after q, k and v: offset 2*head_k_dim + ratio*head_v_dim)
        idx_z_x = (
            idx_z // (num_v_heads // num_k_heads * head_v_dim) * head_qkvz_dim
            + 2 * head_k_dim
            + num_v_heads // num_k_heads * head_v_dim
            + idx_z % (num_v_heads // num_k_heads * head_v_dim)
        )
        z_source_ptrs = x_ptr + idx_seq * stride_x_seq + idx_z_x * stride_x_dim
        mask_z = idx_z < num_v_heads * head_v_dim
        z = tl.load(z_source_ptrs, mask=mask_z, other=0.0)
        z_ptrs = z_ptr + idx_seq * stride_z_seq + idx_z
        tl.store(z_ptrs, z, mask=mask_z)

        ## zero-fill core_attn_out
        # first, zero-fill [0, batch) for core_attn_out
        core_attn_out_ptrs = core_attn_out_ptr + idx_seq * stride_z_seq + idx_z
        tl.store(core_attn_out_ptrs, 0.0, mask=mask_z)
        # second, zero-fill [batch, num_tokens) for both z and core_attn_out;
        # the batch programs stripe over the remaining rows.
        n_repeat = (num_tokens - 1) // batch
        for idx_repeat in tl.range(n_repeat):
            idx_seq_remain = batch * (1 + idx_repeat) + idx_seq
            z_ptrs = z_ptr + idx_seq_remain * stride_z_seq + idx_z
            core_attn_out_ptrs = (
                core_attn_out_ptr + idx_seq_remain * stride_z_seq + idx_z
            )
            mask_remain = (idx_seq_remain < num_tokens) & mask_z
            tl.store(z_ptrs, 0.0, mask=mask_remain)
            tl.store(core_attn_out_ptrs, 0.0, mask=mask_remain)
    ## Role 3: regular causal conv1d update
    else:
        # [BLOCK_N,] elements along the feature-dimension (channel)
        idx_feats = (tl.program_id(1) - 1 - num_program_write_z) * BLOCK_N + tl.arange(
            0, BLOCK_N
        )
        ## map the dense conv feature index to its position in the packed qkvz
        ## layout: three exclusive ranges (q | k | v), each selected by a 0/1
        ## predicate so the sum picks exactly one mapping per element.
        idx_feats_x = (
            (idx_feats < num_k_heads * head_k_dim).to(tl.int64)
            * (idx_feats // head_k_dim * head_qkvz_dim + idx_feats % head_k_dim)
            + (
                (idx_feats >= num_k_heads * head_k_dim)
                & (idx_feats < num_k_heads * head_k_dim * 2)
            ).to(tl.int64)
            * (
                (idx_feats - num_k_heads * head_k_dim) // head_k_dim * head_qkvz_dim
                + head_k_dim
                + (idx_feats - num_k_heads * head_k_dim) % head_k_dim
            )
            + (idx_feats >= num_k_heads * head_k_dim * 2).to(tl.int64)
            * (
                (idx_feats - num_k_heads * head_k_dim * 2)
                // (num_v_heads // num_k_heads * head_v_dim)
                * head_qkvz_dim
                + 2 * head_k_dim
                + (idx_feats - num_k_heads * head_k_dim * 2)
                % (num_v_heads // num_k_heads * head_v_dim)
            )
        )

        if IS_APC_ENABLED:
            # Get the state from the initial_state_idx
            conv_state_init = tl.load(initial_state_idx + idx_seq)
            current_last_index = tl.load(block_idx_last_scheduled_token + idx_seq)
        else:
            conv_state_init = 0
            current_last_index = 0

        # cache_idx
        conv_states_input_coord = tl.load(
            conv_state_indices_ptr + idx_seq * stride_state_indices + conv_state_init
        ).to(tl.int64)

        if USE_PAD_SLOT:  # noqa
            if conv_states_input_coord == pad_slot_id:
                # not processing as this is not the actual sequence
                return

        # IS_VARLEN is False
        query_start_index = idx_seq * seqlen
        query_end_index = query_start_index + seqlen
        x_offset = idx_seq * stride_x_seq
        o_offset = idx_seq * stride_o_seq

        if query_start_index == query_end_index:
            return

        # STEP 1: READ init_state data
        # note: NP2_STATELEN = triton.next_power_of_2(KERNEL_WIDTH - 1)
        idx_cols = tl.arange(0, NP2_STATELEN)
        conv_state_ptrs_cols = (
            conv_state_ptr
            + (conv_states_input_coord * stride_conv_state_seq)
            + (idx_feats * stride_conv_state_dim)[:, None]
            + (idx_cols * stride_conv_state_tok)[None, :]
        )  # [BLOCK_N, NP2_STATELEN]
        mask_cols = (
            (conv_states_input_coord < num_cache_lines)
            & (idx_feats < dim)[:, None]
            & (idx_cols < KERNEL_WIDTH - 1)[None, :]
        )
        cols = tl.load(conv_state_ptrs_cols, mask_cols, other=0.0)

        # STEP 2: assume state_len > seqlen
        idx_tokens = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]

        # The conv_state update works as a sliding window: each forward pass
        # shifts the stored tokens left by ``seqlen``, so the surviving part
        # of the old state is read starting at idx_tokens + seqlen.
        conv_state_ptrs_source = (
            conv_state_ptr
            + (conv_states_input_coord * stride_conv_state_seq)
            + (idx_feats * stride_conv_state_dim)[None, :]
            + ((idx_tokens + seqlen) * stride_conv_state_tok)[:, None]
        )  # [BLOCK_M, BLOCK_N]
        mask = (
            (conv_states_input_coord < num_cache_lines)
            & ((idx_tokens + seqlen) < state_len)[:, None]
            & (idx_feats < dim)[None, :]
        )
        conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0)

        VAL = state_len - seqlen
        x_base = x_ptr + x_offset + (idx_feats_x * stride_x_dim)  # [BLOCK_N]

        x_ptrs = (
            x_base[None, :] + ((idx_tokens - VAL) * stride_x_token)[:, None]
        )  # [BLOCK_M, BLOCK_N]

        mask_x = (
            (idx_tokens - VAL >= 0)[:, None]
            & (idx_tokens - VAL < seqlen)[:, None]
            & (idx_feats < dim)[None, :]
        )  # token-index # token-index # feature-index
        loaded_x = tl.load(x_ptrs, mask_x, 0.0)
        tl.debug_barrier()

        # Shifted old state where it survives, fresh x tokens at the tail.
        new_conv_state = tl.where(mask, conv_state, loaded_x)

        # Get the state from the initial_state_idx
        # cache_idx
        conv_states_offset = tl.load(
            conv_state_indices_ptr + idx_seq * stride_state_indices + current_last_index
        ).to(tl.int64)
        conv_state_ptrs_target = (
            conv_state_ptr
            + (conv_states_offset * stride_conv_state_seq)  # Offset from seq
            + (idx_feats * stride_conv_state_dim)
        )[
            None, :
        ] + (  # [BLOCK_N,]
            idx_tokens * stride_conv_state_tok
        )[
            :, None
        ]
        mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :]
        tl.store(conv_state_ptrs_target, new_conv_state, mask)

        # STEP 3: init accumulator, not necessary
        # if HAS_BIAS:
        #     bias = bias_ptr + idx_feats
        #     mask_bias = idx_feats < dim
        #     acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
        #         tl.float32
        #     )  # [BLOCK_N]
        # else:
        #     acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)

        # STEP 4:
        # LOAD WEIGHTS and compute
        w_cols_ptrs = (
            w_ptr
            + (idx_feats * stride_w_dim)[:, None]
            + (idx_cols * stride_w_width)[None, :]
        )
        mask_w_cols = (idx_feats < dim)[:, None] & (idx_cols < KERNEL_WIDTH - 1)[
            None, :
        ]
        w_cols = tl.load(w_cols_ptrs, mask_w_cols, other=0.0)  # [BLOCK_N, NP2_STATELEN]

        w_last_ptrs = (
            w_ptr + (idx_feats * stride_w_dim) + (KERNEL_WIDTH - 1) * stride_w_width
        )
        w_last = tl.load(w_last_ptrs, idx_feats < dim, other=0.0)  # [BLOCK_N]

        # For the convolution output: dot(weights, [state_cols | x])
        # cols is [BLOCK_N, NP2_STATELEN] = conv_state history
        # We need x as 1D [BLOCK_N] for the last weight column
        x_1d = tl.load(
            x_base, mask=(idx_feats < dim), other=0.0
        )  # [BLOCK_N], reload as 1D
        acc = tl.sum((w_cols * cols).to(tl.float32), axis=1) + (w_last * x_1d).to(
            tl.float32
        )

        if HAS_BIAS:
            bias = bias_ptr + idx_feats
            acc += tl.load(bias, idx_feats < dim, other=0.0).to(tl.float32)  # [BLOCK_N]

        if SILU_ACTIVATION:
            # silu(x) = x * sigmoid(x), written as x / (1 + e^-x)
            acc = acc / (1 + tl.exp(-acc))
        mask_1d = idx_feats < dim
        o_ptrs = o_ptr + o_offset + (idx_feats * stride_o_dim)

        tl.store(o_ptrs, acc, mask=mask_1d)
""" +from .fused_rearrange_sigmoid_gdr import ( + fused_rearrange_sigmoid_gated_delta_rule_update_kernel, +) from .fused_recurrent import _fused_recurrent_gated_delta_rule_fwd_kernel from .fused_sigmoid_gating_recurrent import fused_sigmoid_gating_delta_rule_update __all__ = [ "_fused_recurrent_gated_delta_rule_fwd_kernel", + "fused_rearrange_sigmoid_gated_delta_rule_update_kernel", "fused_sigmoid_gating_delta_rule_update", ] diff --git a/aiter/ops/triton/_triton_kernels/gated_delta_rule/decode/fused_rearrange_sigmoid_gdr.py b/aiter/ops/triton/_triton_kernels/gated_delta_rule/decode/fused_rearrange_sigmoid_gdr.py new file mode 100644 index 0000000000..495fb6ad00 --- /dev/null +++ b/aiter/ops/triton/_triton_kernels/gated_delta_rule/decode/fused_rearrange_sigmoid_gdr.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +# +# This file contains code copied from the flash-linear-attention project. 
# The original source code was licensed under the MIT license and included
# the following copyright notice:
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

import triton
import triton.language as tl


@triton.heuristics(
    {
        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
        "IS_CONTINUOUS_BATCHING": lambda args: args["ssm_state_indices"] is not None,
        "IS_SPEC_DECODING": lambda args: args["num_accepted_tokens"] is not None,
    }
)
@triton.jit(do_not_specialize=["N", "T"])
def fused_rearrange_sigmoid_gated_delta_rule_update_kernel(
    A_log,  # per-v-head log decay, read via i_hv
    a,  # gate pre-activation; per-head scalar, or per (head, k) when IS_KDA
    b,  # beta pre-activation, sigmoid is applied in-kernel
    dt_bias,  # bias added to `a` before the softplus
    beta,  # softplus temperature (scalar)
    threshold,  # softplus linearization threshold (scalar)
    qkv,  # packed q|k|v along the feature axis, (T, 2*H*K + HV*V) — see wrapper
    o,  # output, (NK, B, T, HV, V)
    h0,  # optional initial state
    ht,  # final state (per token, or per cache slot when INPLACE_FINAL_STATE)
    cu_seqlens,  # optional varlen cumulative sequence lengths
    ssm_state_indices,  # optional (seq, tok) -> state slot mapping
    num_accepted_tokens,  # per-seq accepted-token counts for spec decoding
    scale,  # query scaling factor
    N: tl.int64,  # num of sequences
    T: tl.int64,  # num of tokens
    B: tl.constexpr,
    H: tl.constexpr,
    HV: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    stride_qkv_l: tl.constexpr,
    stride_qkv_hd: tl.constexpr,
    stride_init_state_token: tl.constexpr,
    stride_final_state_token: tl.constexpr,
    stride_indices_seq: tl.constexpr,
    stride_indices_tok: tl.constexpr,
    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
    INPLACE_FINAL_STATE: tl.constexpr,  # whether to store final state inplace
    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
    IS_VARLEN: tl.constexpr,
    IS_CONTINUOUS_BATCHING: tl.constexpr,
    IS_SPEC_DECODING: tl.constexpr,
    IS_KDA: tl.constexpr,
):
    """Recurrent sigmoid-gated delta rule over packed QKV, one program per
    (k-block, v-block, sequence*v-head); the state b_h is [BV, BK]."""
    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_n, i_hv = i_nh // HV, i_nh % HV
    # k-head that this v-head belongs to (HV is a multiple of H)
    i_h = i_hv // (HV // H)
    if IS_VARLEN:
        bos, eos = (
            tl.load(cu_seqlens + i_n).to(tl.int64),
            tl.load(cu_seqlens + i_n + 1).to(tl.int64),
        )
        all = T
        T = eos - bos
    else:
        bos, eos = i_n * T, i_n * T + T
        all = B * T

    if T == 0:
        return

    o_k = i_k * BK + tl.arange(0, BK)
    o_v = i_v * BV + tl.arange(0, BV)

    # q, k, v live side by side in the packed feature axis: [q: H*K | k: H*K | v: HV*V]
    p_q = qkv + bos * stride_qkv_l + ((i_h * K) + o_k) * stride_qkv_hd
    p_k = qkv + bos * stride_qkv_l + (H * K + (i_h * K) + o_k) * stride_qkv_hd
    p_v = qkv + bos * stride_qkv_l + (2 * H * K + (i_hv * V) + o_v) * stride_qkv_hd

    p_A_log = A_log + i_hv
    if not IS_KDA:
        # per-head scalar gate
        p_a = a + bos * HV + i_hv
        p_dt_bias = dt_bias + i_hv
    else:
        # per (head, k-dim) gate vector
        p_a = a + (bos * HV + i_hv) * K + o_k
        p_dt_bias = dt_bias + i_hv * K + o_k

    p_b = b + bos * HV + i_hv
    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v

    mask_k = o_k < K
    mask_v = o_v < V
    mask_h = mask_v[:, None] & mask_k[None, :]

    b_h = tl.zeros([BV, BK], dtype=tl.float32)
    if USE_INITIAL_STATE:
        if IS_CONTINUOUS_BATCHING:
            if IS_SPEC_DECODING:
                # resume from the state written for the last accepted token
                i_t = tl.load(num_accepted_tokens + i_n).to(tl.int64) - 1
            else:
                i_t = 0
            state_idx = tl.load(
                ssm_state_indices + i_n * stride_indices_seq + i_t * stride_indices_tok
            ).to(tl.int64)
            if state_idx < 0:
                # negative slot means no valid state for this sequence
                return
            p_h0 = h0 + state_idx * stride_init_state_token
        else:
            p_h0 = h0 + bos * HV * V * K
        p_h0 = p_h0 + i_hv * V * K + o_v[:, None] * K + o_k[None, :]
        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)

    for i_t in range(0, T):
        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
        b_b = tl.load(p_b).to(tl.float32)

        # numerically-safe softplus: linear above `threshold`
        x = tl.load(p_a).to(tl.float32) + tl.load(p_dt_bias).to(tl.float32)
        softplus_x = tl.where(
            beta * x <= threshold, (1 / beta) * tl.log(1 + tl.exp(beta * x)), x
        )
        # log decay g = -exp(A_log) * softplus(a + dt_bias)
        b_g = -tl.exp(tl.load(p_A_log).to(tl.float32)) * softplus_x

        b_beta = tl.sigmoid(b_b.to(tl.float32))

        if USE_QK_L2NORM_IN_KERNEL:
            b_q = b_q * tl.rsqrt(tl.sum(b_q * b_q) + 1e-6)
            b_k = b_k * tl.rsqrt(tl.sum(b_k * b_k) + 1e-6)
        b_q = b_q * scale
        if not IS_KDA:
            b_h *= tl.exp(b_g)
        else:
            b_h *= tl.exp(b_g[None, :])
        # delta rule: v' = beta * (v - h k); h += v' k^T; o = h q
        b_v -= tl.sum(b_h * b_k[None, :], 1)
        b_v *= b_beta
        b_h += b_v[:, None] * b_k[None, :]
        b_o = tl.sum(b_h * b_q[None, :], 1)
        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)

        if INPLACE_FINAL_STATE:
            # write the running state into the per-token cache slot
            final_state_idx = tl.load(
                ssm_state_indices + i_n * stride_indices_seq + i_t * stride_indices_tok
            ).to(tl.int64)
            if final_state_idx >= 0:
                p_ht = ht + final_state_idx * stride_final_state_token
                p_ht = p_ht + i_hv * V * K + o_v[:, None] * K + o_k[None, :]
                tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
        else:
            # dense per-token state output at row (bos + i_t)
            p_ht = ht + (bos + i_t) * stride_final_state_token
            p_ht = p_ht + i_hv * V * K + o_v[:, None] * K + o_k[None, :]
            tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)

        # advance all per-token pointers by one token
        p_q += stride_qkv_l
        p_k += stride_qkv_l
        p_v += stride_qkv_l
        p_o += HV * V
        p_b += HV
        p_a += HV
@triton.heuristics(
    {
        "HAS_BIAS": lambda args: args["B"] is not None,
        "HAS_Z": lambda args: args["Z"] is not None,
    }
)
@triton.jit
def _fused_rms_gated_fp8_group_quant_kernel(
    X,  # input rows, (M, N)
    W,  # RMSNorm weight, (N,)
    B,  # optional RMSNorm bias, (N,)
    Z,  # optional gate tensor, (M, N)
    Y_quant,  # FP8 output, (M, N)
    Scales,  # per-(row, group) scales, (M, NUM_GROUPS)
    stride_x_row,
    stride_z_row,
    stride_y_row,
    stride_s_row,
    stride_s_g,
    M,
    N: tl.constexpr,
    eps,
    RMS_TILE: tl.constexpr,  # column tile for the RMS pass
    ROWS_PER_BLOCK: tl.constexpr,
    GROUP_SIZE: tl.constexpr,  # quantization group width in columns
    NUM_GROUPS: tl.constexpr,
    BLOCK_G: tl.constexpr,  # >= GROUP_SIZE, power-of-2 tile for the quant pass
    HAS_BIAS: tl.constexpr,
    HAS_Z: tl.constexpr,
    NORM_BEFORE_GATE: tl.constexpr,  # gate after the norm (True) or before (False)
    FP8_MIN: tl.constexpr,
    FP8_MAX: tl.constexpr,
    USE_UE8M0: tl.constexpr,  # round scales up to a power of two
    FP8_MIN_SCALING_FACTOR: tl.constexpr,
    ACTIVATION: tl.constexpr,  # gate activation: "swish"/"silu" or "sigmoid"
):
    """Fused RMSNorm + optional gating + per-group FP8 quantization.

    One program handles ROWS_PER_BLOCK rows: a first tiled pass computes the
    full-row RMS statistic, then a second pass normalizes, applies the
    (optional) gate, quantizes each GROUP_SIZE-wide group to FP8 and stores
    one scale per (row, group).
    """
    row_start = tl.program_id(0) * ROWS_PER_BLOCK
    rows = row_start + tl.arange(0, ROWS_PER_BLOCK)
    row_mask_1d = rows < M

    # --- Full-row RMS: accumulate sum of squares in float32 ---
    sumsq = tl.zeros([ROWS_PER_BLOCK], dtype=tl.float32)
    off = 0
    while off < N:
        cols = tl.arange(0, RMS_TILE) + off
        col_mask = cols < N
        mask = row_mask_1d[:, None] & col_mask[None, :]
        row_offsets = rows[:, None] * stride_x_row
        col_offsets = cols[None, :]
        X_base = X + row_offsets + col_offsets
        x = tl.load(X_base, mask=mask, other=0.0).to(tl.float32)
        if HAS_Z and not NORM_BEFORE_GATE:
            # gate-before-norm: the RMS statistic is taken over the gated value
            Z_base = Z + rows[:, None] * stride_z_row + col_offsets
            z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
            if ACTIVATION == "swish" or ACTIVATION == "silu":
                x *= z * tl.sigmoid(z)
            elif ACTIVATION == "sigmoid":
                x *= tl.sigmoid(z)
        xbar = tl.where(mask, x, 0.0)
        sumsq += tl.sum(xbar * xbar, axis=1)
        off += RMS_TILE

    var = sumsq / N
    rstd = tl.rsqrt(var + eps)

    # --- Per-group: normalize (when NORM_BEFORE_GATE), linear, optional gate, FP8 ---
    for g in range(NUM_GROUPS):
        col0 = g * GROUP_SIZE
        cols = tl.arange(0, BLOCK_G) + col0
        col_mask = cols < N
        mask = row_mask_1d[:, None] & col_mask[None, :]
        row_offsets = rows[:, None] * stride_x_row
        col_offsets = cols[None, :]
        X_base = X + row_offsets + col_offsets
        x = tl.load(X_base, mask=mask, other=0.0).to(tl.float32)

        if HAS_Z and not NORM_BEFORE_GATE:
            # re-apply the gate so x matches the value the RMS pass measured
            Z_base = Z + rows[:, None] * stride_z_row + col_offsets
            z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
            if ACTIVATION == "swish" or ACTIVATION == "silu":
                x *= z * tl.sigmoid(z)
            elif ACTIVATION == "sigmoid":
                x *= tl.sigmoid(z)

        x_hat = x * rstd[:, None]

        w_mask = cols < N
        w = tl.load(W + cols, mask=w_mask, other=0.0).to(tl.float32)
        if HAS_BIAS:
            b = tl.load(B + cols, mask=w_mask, other=0.0).to(tl.float32)
            y = x_hat * w[None, :] + b[None, :]
        else:
            y = x_hat * w[None, :]

        if HAS_Z and NORM_BEFORE_GATE:
            Z_base = Z + rows[:, None] * stride_z_row + col_offsets
            z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
            if ACTIVATION == "swish" or ACTIVATION == "silu":
                y *= z * tl.sigmoid(z)
            elif ACTIVATION == "sigmoid":
                y *= tl.sigmoid(z)

        # per-(row, group) absmax scaling
        abs_y = tl.where(mask, tl.abs(y), 0.0)
        absmax = tl.max(abs_y, axis=1)
        scales_raw = absmax / FP8_MAX
        if USE_UE8M0:
            # UE8M0: scales are rounded up to the next power of two
            scales_raw = tl.exp2(tl.ceil(tl.log2(scales_raw)))
        scales = tl.maximum(scales_raw, FP8_MIN_SCALING_FACTOR)

        y_scaled = y / scales[:, None]
        y_quant = tl.maximum(tl.minimum(y_scaled, FP8_MAX), FP8_MIN)

        Y_base = Y_quant + rows[:, None] * stride_y_row + col_offsets
        tl.store(Y_base, y_quant.to(Y_quant.dtype.element_ty), mask=mask)

        S_ptr = Scales + rows * stride_s_row + g * stride_s_g
        tl.store(S_ptr, scales, mask=row_mask_1d)
"""Launchers for **causal conv1d update** single-token paths.

``causal_conv1d_update_single_token`` updates ``conv_state`` **in place** inside the Triton kernel
(the "update" in the name), then writes the convolution output into ``x``/``out`` as in vLLM.
"""

from __future__ import annotations

import torch
import triton

from aiter.ops.triton._triton_kernels.causal_conv1d import PAD_SLOT_ID
from aiter.ops.triton._triton_kernels.causal_conv1d_update_single_token import (
    _causal_conv1d_update_single_token_kernel,
    _reshape_causal_conv1d_update_single_token_kernel,
)


def _default_conv_state_indices(batch: int, device: torch.device) -> torch.Tensor:
    """Identity mapping: sequence ``i`` uses cache line ``i``."""
    return torch.arange(batch, device=device, dtype=torch.int32)


def causal_conv1d_update_single_token(
    x: torch.Tensor,
    conv_state: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None = None,
    activation: bool | str | None = None,
    conv_state_indices: torch.Tensor | None = None,
    num_accepted_tokens: torch.Tensor | None = None,
    query_start_loc: torch.Tensor | None = None,
    max_query_len: int = -1,
    pad_slot_id: int = PAD_SLOT_ID,
    block_idx_last_scheduled_token: torch.Tensor | None = None,
    initial_state_idx: torch.Tensor | None = None,
    validate_data: bool = False,
) -> torch.Tensor:
    """Single-token (seqlen == 1) causal conv1d update.

    Mutates ``conv_state`` in place inside the kernel and writes the conv
    output into ``x`` (``out`` aliases ``x``), returning it in the original
    dtype. ``num_accepted_tokens`` and ``query_start_loc`` are reserved for
    the spec-decoding / varlen paths and must be ``None`` here.
    """
    assert (
        num_accepted_tokens is None
    ), f"num_accepted_tokens must be None, got {num_accepted_tokens}"
    assert (
        query_start_loc is None
    ), f"query_start_loc must be None, got {query_start_loc}"
    if validate_data:
        assert pad_slot_id is not None
        assert x.stride(1) == 1
    # normalize the activation argument: True -> "silu", False -> None
    if isinstance(activation, bool):
        activation = "silu" if activation is True else None
    elif activation is not None:
        assert activation in ["silu", "swish"]

    original_x_dtype = x.dtype
    x = x.to(conv_state.dtype)
    # (batch, dim) input gets a trailing seqlen=1 axis for the kernel
    unsqueeze = query_start_loc is None and x.dim() == 2
    if unsqueeze:
        x = x.unsqueeze(-1)
    if query_start_loc is None:
        batch, dim, seqlen = x.shape
    else:
        # NOTE: unreachable today (query_start_loc is asserted None above);
        # kept for parity with the vLLM varlen interface.
        assert conv_state_indices is not None
        batch = conv_state_indices.size(0)
        dim = x.size(1)
        seqlen = max_query_len
    assert (
        seqlen == 1
    ), f"the single_token version only support seqlen to be 1, got {seqlen}"
    _, width = weight.shape
    num_cache_lines, _, state_len = conv_state.size()

    if conv_state_indices is None:
        conv_state_indices = _default_conv_state_indices(batch, x.device)

    if validate_data:
        assert dim == weight.size(0)
        assert conv_state.stride(-2) == 1, (
            f"ERROR: expect contiguous along feat-dim of conv_state "
            f"(currently stride={conv_state.stride()})"
        )
        assert state_len >= width - 1
        assert dim == conv_state.size(1)
        assert (batch,) == conv_state_indices.shape
        assert num_cache_lines >= batch
        assert weight.stride(1) == 1

    # output is written in place over the (converted) input
    out = x
    stride_w_dim, stride_w_width = weight.stride()

    if query_start_loc is None:
        stride_x_seq, stride_x_dim, stride_x_token = x.stride()
        stride_o_seq, stride_o_dim, stride_o_token = out.stride()
    else:
        stride_x_token, stride_x_dim = x.stride()
        stride_x_seq = 0
        stride_o_token, stride_o_dim = out.stride()
        stride_o_seq = 0

    stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride()
    stride_state_indices = conv_state_indices.stride(0)
    # effective sliding-window length the kernel operates on
    if num_accepted_tokens is not None:
        state_len = width - 1 + (seqlen - 1)
    else:
        state_len = width - 1
    np2_statelen = triton.next_power_of_2(state_len)

    def grid(META):
        # one program row per sequence, feature dim tiled by BLOCK_N
        return (batch, triton.cdiv(dim, META["BLOCK_N"]))

    _causal_conv1d_update_single_token_kernel[grid](
        x,
        weight,
        bias,
        conv_state,
        conv_state_indices,
        block_idx_last_scheduled_token,
        initial_state_idx,
        out,
        batch,
        dim,
        seqlen,
        state_len,
        num_cache_lines,
        stride_x_seq,
        stride_x_dim,
        stride_x_token,
        stride_w_dim,
        stride_w_width,
        stride_istate_seq,
        stride_istate_dim,
        stride_istate_token,
        stride_state_indices,
        stride_o_seq,
        stride_o_dim,
        stride_o_token,
        pad_slot_id,
        HAS_BIAS=bias is not None,
        KERNEL_WIDTH=width,
        SILU_ACTIVATION=activation in ["silu", "swish"],
        IS_APC_ENABLED=block_idx_last_scheduled_token is not None,
        NP2_STATELEN=np2_statelen,
        USE_PAD_SLOT=pad_slot_id is not None,
        BLOCK_N=256,
    )
    if unsqueeze:
        out = out.squeeze(-1)
    return out.to(original_x_dtype)


def fused_reshape_causal_conv1d_update_single_token(
    x: torch.Tensor,
    num_actual_tokens: int,
    num_k_heads: int,
    num_v_heads: int,
    head_k_dim: int,
    head_v_dim: int,
    ba: torch.Tensor,
    z_out: torch.Tensor,
    core_attn_out: torch.Tensor,
    conv_state: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None = None,
    activation: bool | str | None = None,
    conv_state_indices: torch.Tensor | None = None,
    num_accepted_tokens: torch.Tensor | None = None,
    query_start_loc: torch.Tensor | None = None,
    max_query_len: int = -1,
    pad_slot_id: int = PAD_SLOT_ID,
    block_idx_last_scheduled_token: torch.Tensor | None = None,
    initial_state_idx: torch.Tensor | None = None,
    validate_data: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Fused qkvz de-interleave + single-token causal conv1d update.

    On top of the conv update, the kernel also de-interleaves (b, a) from
    ``ba``, gathers z into ``z_out`` and zero-fills ``z_out``/``core_attn_out``
    (both mutated in place). Returns ``(conv_out, b_out, a_out)``.
    """
    assert (
        num_accepted_tokens is None
    ), f"num_accepted_tokens must be None, got {num_accepted_tokens}"
    assert (
        query_start_loc is None
    ), f"query_start_loc must be None, got {query_start_loc}"
    assert z_out.is_contiguous(), "z_out should be contiguous"
    assert core_attn_out.is_contiguous(), "core_attn_out should be contiguous"
    # flatten head dims so the kernel indexes a 2-D (tokens, features) view
    x = x.view(x.shape[0], -1)
    ba = ba.view(ba.shape[0], -1)
    assert z_out.size() == core_attn_out.size()
    original_z_shape = z_out.shape
    num_tokens = z_out.shape[0]
    z_out = z_out.view(original_z_shape[0], -1)
    core_attn_out = core_attn_out.view(original_z_shape[0], -1)
    if validate_data:
        assert pad_slot_id is not None
        assert x.stride(1) == 1
    # normalize the activation argument: True -> "silu", False -> None
    if isinstance(activation, bool):
        activation = "silu" if activation is True else None
    elif activation is not None:
        assert activation in ["silu", "swish"]

    original_x_dtype = x.dtype
    x = x.to(conv_state.dtype)
    unsqueeze = query_start_loc is None and x.dim() == 2
    if unsqueeze:
        x = x.unsqueeze(-1)
    _, qkvz_dim, seqlen = x.shape
    assert (
        seqlen == 1
    ), f"the single_token version only support seqlen to be 1, got {seqlen}"
    batch = num_actual_tokens
    _, width = weight.shape
    # per k-head group: q (head_k_dim) + k (head_k_dim) + v (ratio * head_v_dim)
    head_dim = head_k_dim + head_k_dim + head_v_dim * num_v_heads // num_k_heads
    # ... plus z (ratio * head_v_dim) in the packed qkvz layout
    head_qkvz_dim = head_dim + head_v_dim * num_v_heads // num_k_heads
    dim = num_k_heads * head_dim
    expected_qkvz_dim = num_k_heads * head_qkvz_dim
    assert (
        qkvz_dim == expected_qkvz_dim
    ), f"ERROR: expect qkvz_dim to be {expected_qkvz_dim}, got {qkvz_dim}"
    num_cache_lines, _, state_len = conv_state.size()

    if conv_state_indices is None:
        conv_state_indices = _default_conv_state_indices(batch, x.device)

    if validate_data:
        assert dim == weight.size(0)
        assert conv_state.stride(-2) == 1, (
            f"ERROR: expect contiguous along feat-dim of conv_state "
            f"(currently stride={conv_state.stride()})"
        )
        assert state_len >= width - 1
        assert dim == conv_state.size(1)
        assert (batch,) == conv_state_indices.shape
        assert num_cache_lines >= batch
        assert weight.stride(1) == 1

    out = torch.empty((num_actual_tokens, dim, seqlen), dtype=x.dtype, device=x.device)
    b_out = torch.empty(
        (num_actual_tokens, num_v_heads), dtype=ba.dtype, device=ba.device
    )
    a_out = torch.empty(
        (num_actual_tokens, num_v_heads), dtype=ba.dtype, device=ba.device
    )
    stride_w_dim, stride_w_width = weight.stride()

    if query_start_loc is None:
        stride_x_seq, stride_x_dim, stride_x_token = x.stride()
        stride_o_seq, stride_o_dim, stride_o_token = out.stride()
    else:
        # NOTE: unreachable today (query_start_loc is asserted None above)
        stride_x_token, stride_x_dim = x.stride()
        stride_x_seq = 0
        stride_o_token, stride_o_dim = out.stride()
        stride_o_seq = 0

    stride_z_seq = z_out.stride(0)
    stride_ba_seq, stride_ba_token = ba.stride()
    stride_b_seq = b_out.stride(0)

    stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride()
    stride_state_indices = conv_state_indices.stride(0)
    # effective sliding-window length the kernel operates on
    if num_accepted_tokens is not None:
        state_len = width - 1 + (seqlen - 1)
    else:
        state_len = width - 1
    np2_statelen = triton.next_power_of_2(state_len)
    HV = triton.next_power_of_2(num_v_heads)
    BLOCK_Z = 512
    num_program_write_z = triton.cdiv(num_v_heads * head_v_dim, BLOCK_Z)

    def grid(META):
        # axis 1: 1 program for (b, a) + W programs for z + conv programs
        return (
            batch,
            1 + num_program_write_z + triton.cdiv(dim, META["BLOCK_N"]),
        )

    _reshape_causal_conv1d_update_single_token_kernel[grid](
        x,
        ba,
        z_out,
        core_attn_out,
        b_out,
        a_out,
        weight,
        bias,
        conv_state,
        conv_state_indices,
        block_idx_last_scheduled_token,
        initial_state_idx,
        out,
        batch,
        num_tokens,
        num_k_heads,
        num_v_heads,
        head_k_dim,
        head_v_dim,
        dim,
        head_qkvz_dim,
        seqlen,
        state_len,
        num_cache_lines,
        stride_x_seq,
        stride_x_dim,
        stride_x_token,
        stride_w_dim,
        stride_w_width,
        stride_istate_seq,
        stride_istate_dim,
        stride_istate_token,
        stride_state_indices,
        stride_o_seq,
        stride_o_dim,
        stride_o_token,
        stride_z_seq,
        stride_ba_seq,
        stride_ba_token,
        stride_b_seq,
        pad_slot_id,
        num_program_write_z,
        BLOCK_Z,
        HV=HV,
        HAS_BIAS=bias is not None,
        KERNEL_WIDTH=width,
        SILU_ACTIVATION=activation in ["silu", "swish"],
        IS_APC_ENABLED=block_idx_last_scheduled_token is not None,
        NP2_STATELEN=np2_statelen,
        USE_PAD_SLOT=pad_slot_id is not None,
        BLOCK_N=256,
    )
    if unsqueeze:
        out = out.squeeze(-1)
    z_out = z_out.view(original_z_shape)
    core_attn_out = core_attn_out.view(original_z_shape)
    return out.to(original_x_dtype), b_out, a_out
""" +from .fused_rearrange_sigmoid_gdr import fused_rearrange_sigmoid_gated_delta_rule from .gated_delta_rule import ( chunk_gated_delta_rule, chunk_gated_delta_rule_opt, @@ -16,8 +17,9 @@ ) __all__ = [ - "fused_recurrent_gated_delta_rule", "chunk_gated_delta_rule", + "fused_rearrange_sigmoid_gated_delta_rule", + "fused_recurrent_gated_delta_rule", "chunk_gated_delta_rule_opt", "chunk_gated_delta_rule_opt_vk", ] diff --git a/aiter/ops/triton/gated_delta_net/fused_rearrange_sigmoid_gdr.py b/aiter/ops/triton/gated_delta_net/fused_rearrange_sigmoid_gdr.py new file mode 100644 index 0000000000..8dc22a6669 --- /dev/null +++ b/aiter/ops/triton/gated_delta_net/fused_rearrange_sigmoid_gdr.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +# +# Adapted from flash-linear-attention / vLLM (see _triton_kernels copy). 
+ +from __future__ import annotations + +import torch +import triton + +from aiter.ops.triton._triton_kernels.gated_delta_rule.decode.fused_rearrange_sigmoid_gdr import ( + fused_rearrange_sigmoid_gated_delta_rule_update_kernel, +) + + +def fused_rearrange_sigmoid_gated_delta_rule( + A_log: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + dt_bias: torch.Tensor, + qkv: torch.Tensor, + key_dim: int, + value_dim: int, + head_k_dim: int, + head_v_dim: int, + beta: float = 1.0, + threshold: float = 20.0, + scale: float | None = None, + initial_state: torch.Tensor | None = None, + inplace_final_state: bool = True, + cu_seqlens: torch.LongTensor | None = None, + ssm_state_indices: torch.Tensor | None = None, + num_accepted_tokens: torch.Tensor | None = None, + use_qk_l2norm_in_kernel: bool = False, + is_kda: bool = False, + core_attn_out: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Fused Triton sigmoid-gated delta rule over packed QKV (decode-oriented). + """ + expected_shape = (qkv.shape[0], key_dim * 2 + value_dim) + assert ( + qkv.shape == expected_shape + ), f"expect qkv to be in shape {expected_shape}, got {qkv.shape}" + if scale is None: + scale = head_k_dim**-0.5 + else: + assert scale > 0, "scale must be positive" + + B = 1 + T = qkv.shape[0] + H = key_dim // head_k_dim + HV = value_dim // head_v_dim + K = head_k_dim + V = head_v_dim + N = B if cu_seqlens is None else len(cu_seqlens) - 1 + + BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 32) + NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV) + assert NK == 1, "NK > 1 is not supported yet" + num_stages = 3 + num_warps = 4 + + if inplace_final_state and ssm_state_indices is None: + raise ValueError( + "ssm_state_indices is required when inplace_final_state=True " + "(kernel indexes final state slots per token)." 
+ ) + + o = ( + core_attn_out[: NK * B * T * HV * V].view(NK, B, T, HV, V) + if core_attn_out is not None + else qkv.new_empty(NK, B, T, HV, V) + ) + if inplace_final_state: + if initial_state is None: + raise ValueError("initial_state is required when inplace_final_state=True") + final_state = initial_state + else: + st_dtype = initial_state.dtype if initial_state is not None else qkv.dtype + final_state = qkv.new_empty(T, HV, V, K, dtype=st_dtype) + + stride_init_state_token = ( + int(initial_state.stride(0)) if initial_state is not None else 0 + ) + stride_final_state_token = int(final_state.stride(0)) + + if ssm_state_indices is None: + stride_indices_seq, stride_indices_tok = 1, 1 + elif ssm_state_indices.ndim == 1: + stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1 + else: + stride_indices_seq, stride_indices_tok = ssm_state_indices.stride() + + stride_qkv_l, stride_qkv_hd = qkv.stride() + + grid = (NK, NV, N * HV) + fused_rearrange_sigmoid_gated_delta_rule_update_kernel[grid]( + A_log=A_log, + a=a.contiguous(), + b=b.contiguous(), + dt_bias=dt_bias, + beta=beta, + threshold=threshold, + qkv=qkv, + o=o, + h0=initial_state, + ht=final_state, + cu_seqlens=cu_seqlens, + ssm_state_indices=ssm_state_indices, + num_accepted_tokens=num_accepted_tokens, + scale=scale, + N=N, + T=T, + B=B, + H=H, + HV=HV, + K=K, + V=V, + BK=BK, + BV=BV, + stride_qkv_l=stride_qkv_l, + stride_qkv_hd=stride_qkv_hd, + stride_init_state_token=stride_init_state_token, + stride_final_state_token=stride_final_state_token, + stride_indices_seq=stride_indices_seq, + stride_indices_tok=stride_indices_tok, + INPLACE_FINAL_STATE=inplace_final_state, + USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel, + IS_KDA=is_kda, + num_warps=num_warps, + num_stages=num_stages, + ) + o = o.squeeze(0) + return o, final_state diff --git a/aiter/ops/triton/gluon/pa_mqa_logits.py b/aiter/ops/triton/gluon/pa_mqa_logits.py index a1421a6f70..5898b90cce 100644 --- 
a/aiter/ops/triton/gluon/pa_mqa_logits.py +++ b/aiter/ops/triton/gluon/pa_mqa_logits.py @@ -735,6 +735,14 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle( 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) ) ), + mask=( + context_idx + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ) + < max_model_len, ) context_idx = split_context_start + split_context_length - ChunkK @@ -925,6 +933,11 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle( context_idx + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) ), + mask=( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ) + < max_model_len, ) for context_idx_ in range( @@ -1000,6 +1013,14 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle( 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) ) ), + mask=( + context_idx_ + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ) + < max_model_len, ) # ======================================================================================= @@ -1074,6 +1095,14 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle( 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) ) ), + mask=( + context_idx_ + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ) + < max_model_len, ) context_idx = context_idx_ + ChunkK @@ -1107,6 +1136,12 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle( + ChunkKPerStage + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) ), + mask=( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ) + < max_model_len, ) @@ -1547,6 +1582,14 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx( 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) ) ), + mask=( + context_idx + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ) + < 
max_model_len, ) context_idx = split_context_start + split_context_length - ChunkK @@ -1737,6 +1780,11 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx( context_idx + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) ), + mask=( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ) + < max_model_len, ) for context_idx_ in range( @@ -1812,6 +1860,14 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx( 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) ) ), + mask=( + context_idx_ + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ) + < max_model_len, ) # ======================================================================================= @@ -1886,6 +1942,14 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx( 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) ) ), + mask=( + context_idx_ + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ) + < max_model_len, ) context_idx = context_idx_ + ChunkK @@ -1919,4 +1983,10 @@ def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx( + ChunkKPerStage + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) ), + mask=( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ) + < max_model_len, ) diff --git a/aiter/ops/triton/quant/__init__.py b/aiter/ops/triton/quant/__init__.py index 144a0247dd..5a1eec8cac 100644 --- a/aiter/ops/triton/quant/__init__.py +++ b/aiter/ops/triton/quant/__init__.py @@ -7,11 +7,14 @@ ) from .fused_fp8_quant import ( + calc_rows_per_block, fused_rms_fp8_per_tensor_static_quant, fused_rms_fp8_group_quant, + fused_rms_gated_fp8_group_quant, fused_flatten_fp8_group_quant, fused_reduce_act_mul_fp8_group_quant, fused_reduce_rms_fp8_group_quant, + get_fp8_min_max_bounds, ) from .fused_mxfp4_quant import ( @@ -30,8 +33,11 @@ "dynamic_mxfp4_quant", 
"_mxfp4_quant_op", # fused_fp8_quant.py exports + "calc_rows_per_block", + "get_fp8_min_max_bounds", "fused_rms_fp8_per_tensor_static_quant", "fused_rms_fp8_group_quant", + "fused_rms_gated_fp8_group_quant", "fused_flatten_fp8_group_quant", "fused_reduce_act_mul_fp8_group_quant", "fused_reduce_rms_fp8_group_quant", diff --git a/aiter/ops/triton/quant/fused_fp8_quant.py b/aiter/ops/triton/quant/fused_fp8_quant.py index 7575310770..f0583a86b3 100644 --- a/aiter/ops/triton/quant/fused_fp8_quant.py +++ b/aiter/ops/triton/quant/fused_fp8_quant.py @@ -1,3 +1,4 @@ +from functools import cache from typing import Optional import torch import triton @@ -5,11 +6,13 @@ from aiter.ops.triton._triton_kernels.quant.fused_fp8_quant import ( _fused_rms_fp8_per_tensor_static_quant_kernel, _fused_rms_fp8_group_quant_kernel, + _fused_rms_gated_fp8_group_quant_kernel, _fused_flatten_fp8_group_quant_kernel, _fused_reduce_act_mul_fp8_group_quant, _fused_reduce_rms_fp8_group_quant_kernel, _fused_silu_mul_fp8_per_tensor_static_quant_kernel, ) +from aiter.ops.triton.utils.types import get_fp8_e4m3_dtype from aiter.ops.triton._triton_kernels.activation import ( _get_activation_from_str, ) @@ -329,6 +332,167 @@ def fused_rms_fp8_group_quant( return (out1_fp8, out1_bs), out1, out2, out_res1 +def get_fp8_min_max_bounds(fp8_dtype: torch.dtype) -> tuple[float, float]: + """Match vLLM ``quant_utils.get_fp8_min_max`` for ``fp8_dtype`` (incl. 
ROCm fnuz ±224).""" + if fp8_dtype == torch.float8_e4m3fnuz: + return -224.0, 224.0 + finfo = torch.finfo(fp8_dtype) + return float(finfo.min), float(finfo.max) + + +@cache +def _num_compute_units(device_id: int = 0) -> int: + """Match vLLM ``vllm.utils.platform_utils.num_compute_units`` (``current_platform.num_compute_units``).""" + return torch.cuda.get_device_properties(device_id).multi_processor_count + + +def calc_rows_per_block(M: int, device: torch.device) -> int: + """Same heuristic as vLLM ``input_quant_fp8.calc_rows_per_block``.""" + if device.type != "cuda": + raise ValueError( + "fused_rms_gated_fp8_group_quant targets AMD ROCm (HIP); expected a CUDA/HIP device." + ) + device_id = ( + device.index if device.index is not None else torch.cuda.current_device() + ) + sm_count = max(int(_num_compute_units(device_id)), 1) + rows_per_block = triton.next_power_of_2(triton.cdiv(M, 2 * sm_count)) + return min(int(rows_per_block), 4) + + +def fused_rms_gated_fp8_group_quant( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None, + z: torch.Tensor, + eps: float, + *, + norm_before_gate: bool = True, + use_ue8m0: bool = False, + activation: str = "silu", + out_dtype: torch.dtype | None = None, + fp8_min: float | None = None, + fp8_max: float | None = None, + fp8_min_scaling_factor: float | None = None, + group_size: int | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Fused RMSNorm (with optional bias), optional multiplicative gate from ``z``, + and FP8 quantization (same contract as vLLM ``_rmsnorm_quantize_group_native`` for + ``group_size == N``). 
+ + Comparison with ``fused_rms_fp8_group_quant``: + Use ``fused_rms_fp8_group_quant`` when you need optional **two-stream** RMSNorm + (``inp1`` / optional ``inp2`` with separate weights and epsilons), optional + **residual** fused into ``inp1`` (``res1``), FP8 group quantization on the **first** + normalized stream only, the richer return tuple (quantized FP8, block scales, + optional unquantized ``inp1``, second RMS output, residual output), and optional + ``transpose_scale`` layout for scales. + + Use **this** function for **single** hidden ``x``, one RMS **weight** (and optional + **bias**), plus ``z`` for **elementwise multiplicative gating** (SiLU / sigmoid-style + activations on ``z``) matching ``x``'s shape; optional ``norm_before_gate`` ordering; + vLLM-aligned FP8 bounds / optional UE8M0 / ``group_size`` (``None`` = one scale per + row, else per-column-group scales). Returns only ``(x_quant_fp8, scales)``. Suited to + gated RMSNorm input quantization (e.g. SwiGLU-style / vLLM + ``_rmsnorm_quantize_group_native`` contracts), not the two-stream + residual pattern + above. + + ``x`` and ``z`` must be 2D contiguous with identical shape ``(M, N)``. + Returns ``(x_quant_fp8, scales)`` where ``scales`` is ``(M,)`` float32 if + ``group_size`` is ``None`` (one scale per row), or ``(M, N // group_size)`` float32 + when ``group_size`` divides ``N`` (one scale per row per column group). + + ``fp8_min`` / ``fp8_max`` / ``fp8_min_scaling_factor`` default from ``out_dtype`` (or + ``get_fp8_e4m3_dtype()``) using the same rules as vLLM ``get_fp8_min_max`` and + ``1.0 / (_FP8_MAX * 512)``. Pass them explicitly when you want to pin values (e.g. from + vLLM's ``get_fp8_min_max()`` at model init). + + Raises: + ValueError: if ``group_size`` is not ``None`` and ``group_size > N``, + ``group_size <= 0``, or ``N`` is not divisible by ``group_size``. 
+ """ + assert x.is_contiguous() and z.is_contiguous() + assert x.shape == z.shape, "x and z must have the same shape" + fp8_dtype = out_dtype if out_dtype is not None else get_fp8_e4m3_dtype() + if (fp8_min is None) ^ (fp8_max is None): + raise ValueError("fp8_min and fp8_max must be passed together or both omitted.") + if fp8_min is None: + fp8_min, fp8_max = get_fp8_min_max_bounds(fp8_dtype) + if fp8_min_scaling_factor is None: + fp8_min_scaling_factor = 1.0 / (fp8_max * 512.0) + + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + + M, N = x.shape + if group_size is not None: + if group_size <= 0: + raise ValueError(f"group_size must be positive, got {group_size}") + if group_size > N: + raise ValueError( + f"group_size ({group_size}) must be less than or equal to hidden size " + f"N ({N}); per-column FP8 groups cannot exceed the row width." + ) + if N % group_size != 0: + raise ValueError( + f"hidden size N ({N}) must be divisible by group_size ({group_size})." 
+ ) + + effective_gs = N if group_size is None else int(group_size) + num_groups = N // effective_gs + + MAX_FUSED_SIZE = 65536 // x.element_size() + if N > MAX_FUSED_SIZE: + raise RuntimeError("This RMSNorm quant kernel does not support N >= 64KB.") + + rms_tile = min(512, triton.next_power_of_2(N)) + block_g = triton.next_power_of_2(effective_gs) + rows_per_block = calc_rows_per_block(M, x.device) + num_warps = min(max(block_g // 256, 1), 8) + + x_quant = torch.empty(M, N, dtype=fp8_dtype, device=x.device) + if group_size is None: + scales = torch.empty(M, dtype=torch.float32, device=x.device) + stride_s_row = int(scales.stride(0)) + stride_s_g = 0 + else: + scales = torch.empty(M, num_groups, dtype=torch.float32, device=x.device) + stride_s_row, stride_s_g = (int(scales.stride(0)), int(scales.stride(1))) + + grid = (triton.cdiv(M, rows_per_block),) + _fused_rms_gated_fp8_group_quant_kernel[grid]( + x, + weight, + bias, + z, + x_quant, + scales, + x.stride(0), + z.stride(0), + x_quant.stride(0), + stride_s_row, + stride_s_g, + M, + N, + eps, + RMS_TILE=rms_tile, + ROWS_PER_BLOCK=rows_per_block, + GROUP_SIZE=effective_gs, + NUM_GROUPS=num_groups, + BLOCK_G=block_g, + NORM_BEFORE_GATE=norm_before_gate, + FP8_MIN=fp8_min, + FP8_MAX=fp8_max, + USE_UE8M0=use_ue8m0, + FP8_MIN_SCALING_FACTOR=fp8_min_scaling_factor, + num_warps=num_warps, + ACTIVATION=activation, + ) + return x_quant, scales + + def fused_flatten_fp8_group_quant( x: torch.Tensor, group_size, diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu index 6d99612be2..5449d5b7ee 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu @@ -1,7 +1,6 @@ // SPDX-License-Identifier: MIT // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
-#include #include #include @@ -15,7 +14,7 @@ #include "gemm_a8w8_blockscale_manifest.h" using BlockwiseKernel = std::function; + torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, int)>; using BlockwiseKernelMap = GemmDispatchMap; @@ -83,22 +82,28 @@ torch::Tensor gemm_a8w8_blockscale(torch::Tensor& XQ, torch::Tensor& WQ, torch::Tensor& x_scale, torch::Tensor& w_scale, - torch::Tensor& Y) + torch::Tensor& Y, + int splitK) { TORCH_CHECK(XQ.dtype() == WQ.dtype(), "Weights and activations should have the same dtype!"); TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), "Scales should have the same dtype!"); - int M = XQ.size(0); - int N = WQ.size(0); - int K = XQ.size(1); + TORCH_CHECK(splitK >= 0 && splitK <= 30, + "splitK must be in the range [0, 30], got ", + splitK); + + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + int KBatch = 1 << splitK; if(x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::Half) { - blockscale_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y); + blockscale_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y, KBatch); } else if(x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::BFloat16) { - blockscale_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y); + blockscale_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y, KBatch); } else { diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu index d5cdf0d239..b6a9f3ca73 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile.cu @@ -1,7 +1,6 @@ // SPDX-License-Identifier: MIT // Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
-#include #include #include @@ -15,7 +14,7 @@ #include "gemm_a8w8_blockscale_cktile_manifest.h" using BlockwiseKernel = std::function; + torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, bool, int)>; using BlockwiseKernelMap = GemmDispatchMap; @@ -83,24 +82,30 @@ torch::Tensor gemm_a8w8_blockscale_cktile(torch::Tensor& XQ, torch::Tensor& x_scale, torch::Tensor& w_scale, torch::Tensor& Y, - bool preshuffleB) + bool preshuffleB, + int splitK) { TORCH_CHECK(XQ.dtype() == WQ.dtype(), "Weights and activations should have the same dtype!"); TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), "Scales should have the same dtype!"); - int M = XQ.size(0); - int N = WQ.size(0); - int K = XQ.size(1); + TORCH_CHECK(splitK >= 0 && splitK <= 30, + "splitK must be in the range [0, 30], got ", + splitK); + + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + int KBatch = 1 << splitK; if(x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::Half) { blockscale_dispatch(M, N, K)( - XQ, WQ, x_scale, w_scale, Y, preshuffleB); + XQ, WQ, x_scale, w_scale, Y, preshuffleB, KBatch); } else if(x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::BFloat16) { blockscale_dispatch(M, N, K)( - XQ, WQ, x_scale, w_scale, Y, preshuffleB); + XQ, WQ, x_scale, w_scale, Y, preshuffleB, KBatch); } else { @@ -116,5 +121,5 @@ torch::Tensor gemm_a8w8_blockscale_bpreshuffle_cktile(torch::Tensor& XQ, torch::Tensor& Y, bool preshuffleB) { - return gemm_a8w8_blockscale_cktile(XQ, WQ, x_scale, w_scale, Y, preshuffleB); + return gemm_a8w8_blockscale_cktile(XQ, WQ, x_scale, w_scale, Y, preshuffleB, 0); } diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile_tune.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile_tune.cu index b1c9077d82..48e183809c 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile_tune.cu +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_cktile_tune.cu @@ -12,7 +12,7 
@@ #include "gemm_a8w8_blockscale_cktile_manifest.h" using BlockwiseKernel = std::function; + torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, bool, int)>; // For certain high priority shapes, we directly use the best kernel rather // than use heuristics. @@ -73,12 +73,12 @@ torch::Tensor gemm_a8w8_blockscale_cktile_tune(torch::Tensor& XQ, if(Y.dtype() == at::ScalarType::BFloat16) { blockwise_dispatch_cktile(kernelId)( - XQ, WQ, x_scale, w_scale, Y, preshuffleB); + XQ, WQ, x_scale, w_scale, Y, preshuffleB, KBatch); } else if(Y.dtype() == at::ScalarType::Half) { blockwise_dispatch_cktile(kernelId)( - XQ, WQ, x_scale, w_scale, Y, preshuffleB); + XQ, WQ, x_scale, w_scale, Y, preshuffleB, KBatch); } else { diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.cu index c6620ab89b..0fea1c4a46 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.cu +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.cu @@ -12,7 +12,7 @@ #include "gemm_a8w8_blockscale_manifest.h" using BlockwiseKernel = std::function; + torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&, int)>; // For certain high priority shapes, we directly use the best kernel rather // than use heuristics. 
@@ -71,11 +71,11 @@ torch::Tensor gemm_a8w8_blockscale_tune(torch::Tensor& XQ, if(Y.dtype() == at::ScalarType::BFloat16) { - blockwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y); + blockwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, KBatch); } else if(Y.dtype() == at::ScalarType::Half) { - blockwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y); + blockwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, KBatch); } else { diff --git a/csrc/ck_gemm_a8w8_blockscale/gen_instances.py b/csrc/ck_gemm_a8w8_blockscale/gen_instances.py index c538659f96..fd6f799b3b 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gen_instances.py +++ b/csrc/ck_gemm_a8w8_blockscale/gen_instances.py @@ -115,7 +115,8 @@ def gen_ck_instance(self, k: KernelInstance): torch::Tensor &WQ, torch::Tensor &x_scale, torch::Tensor &w_scale, - torch::Tensor &Y + torch::Tensor &Y, + int KBatch ) {{ // Get M, N, K from input tensors. @@ -186,7 +187,7 @@ def gen_ck_instance(self, k: KernelInstance): ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; // Run kernel instance. 
- return gemm_a8w8_blockscale_impl(XQ, WQ, x_scale, w_scale, Y); + return gemm_a8w8_blockscale_impl(XQ, WQ, x_scale, w_scale, Y, KBatch); """ INSTANCE_IMPL_str = ( LEGACY_INSTANCE_IMPL.replace( @@ -238,7 +239,8 @@ def gen_ck_instance(self, k: KernelInstance): torch::Tensor &WQ, torch::Tensor &x_scale, torch::Tensor &w_scale, - torch::Tensor &Y + torch::Tensor &Y, + int KBatch ); """ @@ -312,7 +314,8 @@ def gen_manifest_head(self, kernels_dict): torch::Tensor &WQ, torch::Tensor &x_scale, torch::Tensor &w_scale, - torch::Tensor &Y); + torch::Tensor &Y, + int KBatch); """ MAINFEST_end = """ diff --git a/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py b/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py index 4cd9bc02a9..2326dea99e 100644 --- a/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py +++ b/csrc/ck_gemm_a8w8_blockscale/gen_instances_cktile.py @@ -74,7 +74,8 @@ def gen_cktile_instance(self, k: TileKernelInstance): torch::Tensor &x_scale, torch::Tensor &w_scale, torch::Tensor &Y, - bool preshuffleB + bool preshuffleB, + int k_batch ) {{ // Get M, N, K from input tensors. @@ -100,7 +101,7 @@ def gen_cktile_instance(self, k: TileKernelInstance): {str(k.AQRowMajor).lower()}>; // Run kernel instance. 
- return gemm_a8w8_blockscale_cktile_impl(XQ, WQ, x_scale, w_scale, Y, preshuffleB); + return gemm_a8w8_blockscale_cktile_impl(XQ, WQ, x_scale, w_scale, Y, preshuffleB, k_batch); """ TILE_INSTANCE_IMPL_str = TILE_INSTANCE_IMPL.replace( @@ -123,7 +124,8 @@ def gen_cktile_instance(self, k: TileKernelInstance): torch::Tensor &x_scale, torch::Tensor &w_scale, torch::Tensor &Y, - bool preshuffleB + bool preshuffleB, + int k_batch ); """ @@ -198,7 +200,8 @@ def gen_manifest_head(self, kernels_dict): torch::Tensor &x_scale, torch::Tensor &w_scale, torch::Tensor &Y, - bool preshuffleB); + bool preshuffleB, + int k_batch); """ MAINFEST_end = """ diff --git a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h index fa909c12f5..2aa90db358 100644 --- a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h +++ b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h @@ -9,7 +9,8 @@ torch::Tensor gemm_a8w8_blockscale(torch::Tensor& XQ, torch::Tensor& WQ, torch::Tensor& x_scale, torch::Tensor& w_scale, - torch::Tensor& Y); + torch::Tensor& Y, + int splitK = 0); torch::Tensor gemm_a8w8_blockscale_tune(torch::Tensor& XQ, torch::Tensor& WQ, diff --git a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile.h b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile.h index 84455c88c6..a5e8475945 100644 --- a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile.h +++ b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile.h @@ -10,7 +10,8 @@ torch::Tensor gemm_a8w8_blockscale_cktile(torch::Tensor& XQ, torch::Tensor& x_scale, torch::Tensor& w_scale, torch::Tensor& Y, - bool preshuffleB); + bool preshuffleB, + int splitK = 0); torch::Tensor gemm_a8w8_blockscale_cktile_tune(torch::Tensor& XQ, torch::Tensor& WQ, diff --git a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile_common.cuh 
b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile_common.cuh index 773c08311a..595a810056 100644 --- a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile_common.cuh +++ b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_cktile_common.cuh @@ -234,11 +234,6 @@ void TileGemmComputeImpl(ck_tile::QuantGemmHostArgs& args) const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); const dim3 blocks = Kernel::BlockSize(); - if(args.k_batch != 1) - { - throw std::runtime_error("split-k is not supported yet!"); - } - if(!Kernel::IsSupportedArgument(kargs)) { throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); @@ -283,7 +278,8 @@ __forceinline__ torch::Tensor gemm_a8w8_blockscale_cktile_impl(torch::Tensor& XQ torch::Tensor& x_scale, torch::Tensor& w_scale, torch::Tensor& Y, - bool PreshuffleB) + bool PreshuffleB, + int k_batch = 1) { // check TORCH_CHECK(XQ.dtype() == WQ.dtype(), "Weights and activations should have the same dtype!"); @@ -372,8 +368,7 @@ __forceinline__ torch::Tensor gemm_a8w8_blockscale_cktile_impl(torch::Tensor& XQ args.bq_ptr = w_scale.data_ptr(); args.c_ptr = Y.data_ptr(); - // split-k is not supported yet for tile quant gemm, set k_batch to 1 - args.k_batch = 1; + args.k_batch = k_batch; args.M = M; args.N = N; args.K = K; @@ -400,6 +395,14 @@ __forceinline__ torch::Tensor gemm_a8w8_blockscale_cktile_impl(torch::Tensor& XQ args.stride_AQ = stride_AQ; args.stride_BQ = stride_BQ; + // Split-K uses atomic_add into C; zero the output buffer first. + // Use zero_() so all rows are cleared regardless of the leading-dimension + // stride (e.g. padded tensors produced by vLLM's _maybe_pad_fp8_weight). 
+ if(k_batch > 1) + { + Y.zero_(); + } + // do tile GEMM if(PreshuffleB) { diff --git a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_common.cuh b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_common.cuh index 84919bd65d..d7f0a43d18 100644 --- a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_common.cuh +++ b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_common.cuh @@ -124,7 +124,8 @@ __forceinline__ torch::Tensor gemm_a8w8_blockscale_impl(torch::Tensor& XQ, torch::Tensor& WQ, torch::Tensor& x_scale, torch::Tensor& w_scale, - torch::Tensor& Y) + torch::Tensor& Y, + int KBatch = 1) { int M = XQ.size(0); int N = WQ.size(0); @@ -160,6 +161,13 @@ __forceinline__ torch::Tensor gemm_a8w8_blockscale_impl(torch::Tensor& XQ, b_element_op, cde_element_op); + TORCH_CHECK(KBatch >= 1, "KBatch must be >= 1, got ", KBatch); + + if(KBatch > 1) + { + device_gemm.SetKBatch(&argument, KBatch); + } + TORCH_CHECK(device_gemm.IsSupportedArgument(argument), "This GEMM is not supported!"); invoker.Run(argument, StreamConfig{at::hip::getCurrentHIPStream()}); diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index d2a2bd654f..08b06441cd 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -497,7 +497,8 @@ namespace py = pybind11; py::arg("WQ"), \ py::arg("x_scale"), \ py::arg("w_scale"), \ - py::arg("Out")); + py::arg("Out"), \ + py::arg("splitK") = 0); #define GEMM_A8W8_BLOCKSCALE_TUNE_PYBIND \ m.def("gemm_a8w8_blockscale_tune", \ @@ -520,7 +521,8 @@ namespace py = pybind11; py::arg("x_scale"), \ py::arg("w_scale"), \ py::arg("Out"), \ - py::arg("preshuffleB") = false); + py::arg("preshuffleB") = false, \ + py::arg("splitK") = 0); #define GEMM_A8W8_BLOCKSCALE_CKTILE_TUNE_PYBIND \ m.def("gemm_a8w8_blockscale_cktile_tune", \ diff --git a/op_tests/test_gemm_a8w8_blockscale.py b/op_tests/test_gemm_a8w8_blockscale.py index 54d6fd02b2..4899e3aa1a 100755 --- a/op_tests/test_gemm_a8w8_blockscale.py +++ 
b/op_tests/test_gemm_a8w8_blockscale.py @@ -13,6 +13,7 @@ import torch import torch.nn.functional as F from aiter import dtypes +from aiter.ops.gemm_op_a8w8 import gemm_a8w8_blockscale_ck, gemm_a8w8_blockscale_cktile from aiter.ops.shuffle import shuffle_weight from aiter.test_common import benchmark, checkAllclose, perftest from einops import rearrange @@ -127,6 +128,54 @@ def run_asm(x, weight, x_scale, w_scale, dtype=dtypes.bf16, kernel_name=None): return aiter.gemm_a8w8_blockscale_bpreshuffle_asm(x, weight, out, x_scale, w_scale) +def test_splitk_correctness(m=4, n=2112, k=7168, dtype=dtypes.bf16, splitK=1): + """Verify that splitK > 0 produces the same output as splitK=0 (within fp tolerance). + + split-K accumulates partial tiles via atomic_add, which changes the floating-point + reduction order. We therefore use a relaxed tolerance that matches the cumulative + rounding error introduced by K-splitting. + """ + block_shape_n, block_shape_k = block_shape + scale_n = (n + block_shape_n - 1) // block_shape_n + scale_k = (k + block_shape_k - 1) // block_shape_k + + x = (torch.rand((m, k), dtype=dtypes.fp32, device="cuda") / 10).to(dtypes.fp8) + weight = (torch.rand((n, k), dtype=dtypes.fp32, device="cuda") / 10).to(dtypes.fp8) + x_scale = torch.rand([m, scale_k], dtype=dtypes.fp32, device="cuda") + w_scale = torch.rand([scale_n, scale_k], dtype=dtypes.fp32, device="cuda") + + # CK path (no preshuffle): compare splitK=0 vs splitK>0 + Y_base = torch.empty((m, n), dtype=dtype, device="cuda") + Y_split = torch.empty((m, n), dtype=dtype, device="cuda") + gemm_a8w8_blockscale_ck(x, weight, x_scale, w_scale, Y_base, splitK=0) + gemm_a8w8_blockscale_ck(x, weight, x_scale, w_scale, Y_split, splitK=splitK) + ck_err = checkAllclose( + Y_base, Y_split, msg=f"ck splitK={splitK} vs splitK=0", rtol=1e-2, atol=1e-2 + ) + + # CKTile path (no preshuffle): compare splitK=0 vs splitK>0 + Y_base_tile = torch.empty((m, n), dtype=dtype, device="cuda") + Y_split_tile = torch.empty((m, 
n), dtype=dtype, device="cuda") + gemm_a8w8_blockscale_cktile( + x, weight, x_scale, w_scale, Y_base_tile, False, splitK=0 + ) + gemm_a8w8_blockscale_cktile( + x, weight, x_scale, w_scale, Y_split_tile, False, splitK=splitK + ) + cktile_err = checkAllclose( + Y_base_tile, + Y_split_tile, + msg=f"cktile splitK={splitK} vs splitK=0", + rtol=1e-2, + atol=1e-2, + ) + + print( + f"test_splitk_correctness(m={m}, n={n}, k={k}, splitK={splitK}): " + f"ck_err={ck_err:.4g}, cktile_err={cktile_err:.4g}" + ) + + parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter, description="config input of test", @@ -301,6 +350,12 @@ def run_asm(x, weight, x_scale, w_scale, dtype=dtypes.bf16, kernel_name=None): df_md = df.to_markdown(index=False) aiter.logger.info("gemm_a8w8_blockscale summary (markdown):\n%s", df_md) +# Correctness check: verify split-K produces matching results +print("\nRunning split-K correctness checks ...") +for splitK in [1, 2]: + test_splitk_correctness(m=4, n=512, k=16384, splitK=splitK) + +# Save results from benchmarks if args.output: os.makedirs(args.output, exist_ok=True) if args.csv: diff --git a/op_tests/triton_tests/quant/test_fused_rms_gated_fp8_group_quant.py b/op_tests/triton_tests/quant/test_fused_rms_gated_fp8_group_quant.py new file mode 100644 index 0000000000..0653eccc01 --- /dev/null +++ b/op_tests/triton_tests/quant/test_fused_rms_gated_fp8_group_quant.py @@ -0,0 +1,183 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
+ +"""Tests for ``fused_rms_gated_fp8_group_quant`` (kernel in ``_triton_kernels/quant/fused_fp8_quant``).""" + +import pytest +import torch + +from aiter.ops.triton.quant.fused_fp8_quant import ( + fused_rms_gated_fp8_group_quant, + get_fp8_min_max_bounds, +) +from aiter.ops.triton.utils.types import get_fp8_e4m3_dtype + +cuda_ok = pytest.mark.skipif( + not torch.cuda.is_available(), reason="CUDA/HIP device required" +) + + +def ref_rmsnorm_quant( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None, + z: torch.Tensor, + eps: float, + norm_before_gate: bool, + activation: str, + fmin: float, + fmax: float, + group_size: int | None, +) -> tuple[torch.Tensor, torch.Tensor]: + x32 = x.float() + z32 = z.float() + var = x32.pow(2).mean(-1, keepdim=True) + x_hat = x32 * torch.rsqrt(var + eps) + y = x_hat * weight.float() + if bias is not None: + y = y + bias.float() + if norm_before_gate: + if activation in ("silu", "swish"): + y = y * (z32 * torch.sigmoid(z32)) + elif activation == "sigmoid": + y = y * torch.sigmoid(z32) + fp8_dtype = get_fp8_e4m3_dtype() + gs = x.shape[1] if group_size is None else group_size + ng = x.shape[1] // gs + yg = y.view(y.shape[0], ng, gs) + scales = yg.abs().amax(dim=-1).clamp_min(1e-12) / fmax + y_scaled = yg / scales.unsqueeze(-1) + q = y_scaled.clamp(fmin, fmax).to(fp8_dtype).view_as(y) + if group_size is None: + scales = scales.squeeze(-1) + return q, scales + + +def _scale_broadcast( + scales: torch.Tensor, N: int, group_size: int | None +) -> torch.Tensor: + if group_size is None: + return scales.unsqueeze(-1).expand(-1, N) + return scales.repeat_interleave(group_size, dim=1) + + +@cuda_ok +def test_fused_rms_gated_fp8_group_quant_matches_ref(): + device = "cuda" + torch.manual_seed(0) + M, N = 32, 64 + x = torch.randn(M, N, device=device, dtype=torch.bfloat16) + z = torch.randn(M, N, device=device, dtype=torch.bfloat16) + w = torch.randn(N, device=device, dtype=torch.bfloat16) + bias = torch.randn(N, device=device, 
dtype=torch.bfloat16) + + fp8_dtype = get_fp8_e4m3_dtype() + fmin, fmax = get_fp8_min_max_bounds(fp8_dtype) + scale_floor = 1.0 / (fmax * 512.0) + + y_q, scales_t = fused_rms_gated_fp8_group_quant( + x, + w, + bias, + z, + 1e-5, + norm_before_gate=True, + use_ue8m0=False, + activation="silu", + fp8_min=fmin, + fp8_max=fmax, + fp8_min_scaling_factor=scale_floor, + ) + y_ref, scales_ref = ref_rmsnorm_quant( + x, w, bias, z, 1e-5, True, "silu", fmin, fmax, None + ) + + torch.testing.assert_close(scales_t, scales_ref, rtol=1e-3, atol=1e-3) + sb = _scale_broadcast(scales_ref, N, None) + dq = y_q.float() * sb + dq_ref = y_ref.float() * sb + torch.testing.assert_close(dq, dq_ref, rtol=0.15, atol=0.15) + + y_default, scales_default = fused_rms_gated_fp8_group_quant( + x, + w, + bias, + z, + 1e-5, + norm_before_gate=True, + use_ue8m0=False, + activation="silu", + ) + torch.testing.assert_close(scales_t, scales_default, rtol=0.0, atol=0.0) + torch.testing.assert_close(y_q.float(), y_default.float(), rtol=0.0, atol=0.0) + + +_MS = [1, 3, 4, 512, 1024, 2048, 4096] +_NS = [128, 256] +_GROUP_SIZES = { + 128: [1, 2, 4, 8, 16, 32, 64, 128], + 256: [1, 2, 4, 8, 16, 32, 64, 128, 256], +} + + +def _sweep_cases(): + out = [] + for N in _NS: + for M in _MS: + for g in _GROUP_SIZES[N]: + out.append(pytest.param(M, N, g, id=f"M{M}-N{N}-g{g}")) + return out + + +@cuda_ok +@pytest.mark.parametrize(("M", "N", "group_size"), _sweep_cases()) +def test_fused_rms_gated_fp8_group_quant_sweep(M: int, N: int, group_size: int): + device = "cuda" + torch.manual_seed(1) + x = torch.randn(M, N, device=device, dtype=torch.bfloat16) + z = torch.randn(M, N, device=device, dtype=torch.bfloat16) + w = torch.randn(N, device=device, dtype=torch.bfloat16) + bias = torch.randn(N, device=device, dtype=torch.bfloat16) + fmin, fmax = get_fp8_min_max_bounds(get_fp8_e4m3_dtype()) + scale_floor = 1.0 / (fmax * 512.0) + + y_q, scales_t = fused_rms_gated_fp8_group_quant( + x, + w, + bias, + z, + 1e-5, + 
norm_before_gate=True, + use_ue8m0=False, + activation="silu", + fp8_min=fmin, + fp8_max=fmax, + fp8_min_scaling_factor=scale_floor, + group_size=group_size, + ) + y_ref, scales_ref = ref_rmsnorm_quant( + x, w, bias, z, 1e-5, True, "silu", fmin, fmax, group_size + ) + + assert scales_t.shape == scales_ref.shape + torch.testing.assert_close(scales_t, scales_ref, rtol=1e-3, atol=1e-3) + sb = _scale_broadcast(scales_ref, N, group_size) + dq = y_q.float() * sb + dq_ref = y_ref.float() * sb + torch.testing.assert_close(dq, dq_ref, rtol=0.15, atol=0.15) + + +@cuda_ok +def test_fused_rms_gated_fp8_group_quant_group_size_errors(): + device = "cuda" + x = torch.randn(2, 128, device=device, dtype=torch.bfloat16) + z = torch.randn_like(x) + w = torch.randn(128, device=device, dtype=torch.bfloat16) + b = torch.randn(128, device=device, dtype=torch.bfloat16) + with pytest.raises(ValueError, match="less than or equal to hidden size"): + fused_rms_gated_fp8_group_quant(x, w, b, z, 1e-5, group_size=256) + with pytest.raises(ValueError, match="divisible by group_size"): + fused_rms_gated_fp8_group_quant(x, w, b, z, 1e-5, group_size=48) + with pytest.raises(ValueError, match="positive"): + fused_rms_gated_fp8_group_quant(x, w, b, z, 1e-5, group_size=0) diff --git a/op_tests/triton_tests/test_causal_conv1d_update_single_token.py b/op_tests/triton_tests/test_causal_conv1d_update_single_token.py new file mode 100644 index 0000000000..ad59e58427 --- /dev/null +++ b/op_tests/triton_tests/test_causal_conv1d_update_single_token.py @@ -0,0 +1,414 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. + +"""Tests for ``causal_conv1d_update_single_token`` / ``fused_reshape_causal_conv1d_update_single_token``. 
+ +``causal_conv1d_update_single_token`` updates ``conv_state`` in place; the reference mirrors +``_causal_conv1d_update_single_token_kernel`` (non-APC), not ``causal_conv1d_update_ref``. +Shape extras that used to live in smoke tests are folded into +``test_causal_conv1d_update_single_token_matches_ref`` (see ``_causal_conv1d_update_single_token_ref_cases``). +""" + +from __future__ import annotations + +import random + +import numpy as np +import pytest +import torch +import triton + +from aiter.ops.triton._triton_kernels.causal_conv1d import PAD_SLOT_ID +from aiter.ops.triton.causal_conv1d_update_single_token import ( + causal_conv1d_update_single_token, + fused_reshape_causal_conv1d_update_single_token, +) + +cuda_ok = pytest.mark.skipif( + not torch.cuda.is_available(), reason="CUDA/HIP device required" +) + + +def seed_everything(seed: int = 0) -> None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + +def ref_causal_conv1d_update_single_token( + x: torch.Tensor, + conv_state: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None, + activation: str | None, + conv_state_indices: torch.Tensor, + pad_slot_id: int | None, +) -> torch.Tensor: + """Python port of ``_causal_conv1d_update_single_token_kernel`` (non-APC, 1D indices). + + Mutates ``conv_state`` in place (like the Triton kernel). Clones ``x`` only for + ``out`` leaves non-updated timesteps equal to the input. 
+ """ + out = x.clone() + batch, dim, seqlen = x.shape + width = weight.shape[1] + state_len = width - 1 + np2 = triton.next_power_of_2(state_len) + num_cache_lines = conv_state.shape[0] + silu = activation in ("silu", "swish") + + if conv_state_indices.ndim != 1: + raise NotImplementedError("reference supports 1D conv_state_indices only") + + for b in range(batch): + coord_read = int(conv_state_indices[b].item()) + if pad_slot_id is not None and coord_read == pad_slot_id: + continue + coord_write = int(conv_state_indices[b].item()) + val = state_len - seqlen + + for f in range(dim): + cols_hist = [] + for j in range(np2): + if j < width - 1: + cols_hist.append(float(conv_state[coord_read, f, j].item())) + else: + cols_hist.append(0.0) + + for j in range(np2): + mask_cs = (coord_read < num_cache_lines) and (j + seqlen < state_len) + v_cs = ( + float(conv_state[coord_read, f, j + seqlen].item()) + if mask_cs + else 0.0 + ) + t = j - val + mask_x = (0 <= t) and (t < seqlen) + v_x = float(x[b, f, t].item()) if mask_x else 0.0 + new_v = v_cs if mask_cs else v_x + if j < state_len: + conv_state[coord_write, f, j] = torch.tensor( + new_v, dtype=conv_state.dtype, device=conv_state.device + ) + + acc = 0.0 + for j in range(np2): + wj = float(weight[f, j].item()) if j < width - 1 else 0.0 + acc += wj * cols_hist[j] + w_last = float(weight[f, width - 1].item()) + x0 = float(x[b, f, 0].item()) + acc += w_last * x0 + if bias is not None: + acc += float(bias[f].item()) + if silu: + acc = acc / (1.0 + np.exp(-acc)) + out[b, f, 0] = torch.tensor(acc, dtype=out.dtype, device=out.device) + + return out + + +def _logical_feat_to_qkvz_col_v2( + idx_feats: int, + num_k_heads: int, + head_k_dim: int, + head_v_dim: int, + head_qkvz_dim: int, + hv_ratio: int, +) -> int: + nk, hk, hv = num_k_heads, head_k_dim, head_v_dim + if idx_feats < nk * hk: + h = idx_feats // hk + r = idx_feats % hk + return h * head_qkvz_dim + r + if idx_feats < nk * hk * 2: + rel = idx_feats - nk * hk + h = rel // 
def _logical_feat_to_qkvz_col_v2(
    idx_feats: int,
    num_k_heads: int,
    head_k_dim: int,
    head_v_dim: int,
    head_qkvz_dim: int,
    hv_ratio: int,
) -> int:
    """Map a linear conv-channel index to its column in the packed QKVZ layout.

    Channels enumerate all q taps, then all k taps, then the grouped v taps
    (``hv_ratio`` value heads per key head); each k-head occupies a contiguous
    ``head_qkvz_dim`` span of the packed tensor.
    """
    nk, hk, hv = num_k_heads, head_k_dim, head_v_dim
    if idx_feats < nk * hk:  # q section
        h, r = divmod(idx_feats, hk)
        return h * head_qkvz_dim + r
    if idx_feats < nk * hk * 2:  # k section
        h, r = divmod(idx_feats - nk * hk, hk)
        return h * head_qkvz_dim + hk + r
    # v section (grouped: hv_ratio value heads per key head)
    gs = hv_ratio * hv
    h, r = divmod(idx_feats - nk * hk * 2, gs)
    return h * head_qkvz_dim + 2 * hk + r


def ref_fused_reshape_causal_conv1d_update_single_token(
    x: torch.Tensor,
    num_actual_tokens: int,
    num_k_heads: int,
    num_v_heads: int,
    head_k_dim: int,
    head_v_dim: int,
    ba: torch.Tensor,
    z_out: torch.Tensor,
    core_attn_out: torch.Tensor,
    conv_state: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None,
    activation: str | None,
    conv_state_indices: torch.Tensor | None,
    pad_slot_id: int | None,
) -> tuple[
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
]:
    """Reference: extract b/a/z like the kernel, build logical QKV, run the conv update.

    Returns a 6-tuple ``(out, b, a, z, core_attn, conv_state)``.
    (Annotation fixed: the previous 5-element annotation did not match the
    6-element return value.)
    """
    num_tokens = x.shape[0]
    hv_ratio = num_v_heads // num_k_heads
    head_dim = head_k_dim + head_k_dim + head_v_dim * hv_ratio
    head_qkvz_dim = head_dim + head_v_dim * hv_ratio
    dim = num_k_heads * head_dim
    seqlen = x.shape[2]
    device = x.device
    dtype = x.dtype

    # Split the packed ``ba`` tensor into per-value-head b (gate) and a columns.
    b_out = torch.empty(num_actual_tokens, num_v_heads, device=device, dtype=ba.dtype)
    a_out = torch.empty_like(b_out)
    for tok in range(num_actual_tokens):
        for hv in range(num_v_heads):
            kh, vh = divmod(hv, hv_ratio)
            b_col = kh * (2 * hv_ratio) + vh
            a_col = kh * (2 * hv_ratio) + hv_ratio + vh
            b_out[tok, hv] = ba[tok, b_col]
            a_out[tok, hv] = ba[tok, a_col]

    # Copy z out of the packed QKVZ tensor into the flat z buffer.
    z_flat = z_out.reshape(num_tokens, -1).clone()
    core_flat = core_attn_out.reshape(num_tokens, -1).clone()
    gs = hv_ratio * head_v_dim
    for tok in range(num_tokens):
        for col in range(num_v_heads * head_v_dim):
            src = (
                (col // gs) * head_qkvz_dim
                + 2 * head_k_dim
                + hv_ratio * head_v_dim
                + (col % gs)
            )
            z_flat[tok, col] = x[tok, src, 0]

    # Zero the speculative/repeat region beyond the actual tokens.
    n_repeat = (num_tokens - 1) // num_actual_tokens if num_actual_tokens else 0
    for rep in range(n_repeat):
        for tok in range(num_actual_tokens):
            tail = num_actual_tokens * (1 + rep) + tok
            if tail < num_tokens:
                z_flat[tail].zero_()
                core_flat[tail].zero_()

    # Gather the logical (q, k, grouped-v) channels in conv-channel order.
    x_lin = torch.zeros(num_actual_tokens, dim, seqlen, device=device, dtype=dtype)
    for tok in range(num_actual_tokens):
        for feat in range(dim):
            col = _logical_feat_to_qkvz_col_v2(
                feat, num_k_heads, head_k_dim, head_v_dim, head_qkvz_dim, hv_ratio
            )
            for t in range(seqlen):
                x_lin[tok, feat, t] = x[tok, col, t]

    cs = conv_state.clone()
    if conv_state_indices is None:
        cidx = torch.arange(num_actual_tokens, device=device, dtype=torch.int32)
    else:
        cidx = conv_state_indices
    out_lin = ref_causal_conv1d_update_single_token(
        x_lin,
        cs,
        weight,
        bias,
        activation,
        cidx,
        pad_slot_id,
    )
    if seqlen == 1:
        out_lin = out_lin.squeeze(-1)
    return (
        out_lin,
        b_out,
        a_out,
        z_flat.view_as(z_out),
        core_flat.view_as(core_attn_out),
        cs,
    )
@cuda_ok
@pytest.mark.parametrize(
    (
        "batch",
        "dim",
        "width",
        "seqlen",
        "itype",
        "silu_activation",
        "has_bias",
    ),
    _causal_conv1d_update_single_token_ref_cases(),
)
def test_causal_conv1d_update_single_token_matches_ref(
    batch, dim, width, seqlen, itype, silu_activation, has_bias
):
    """Triton kernel vs Python reference; conv_state must match bit-exactly."""
    device = "cuda"
    # Tolerances per dtype. The parametrized cases use fp32/bf16 only, so the
    # previous intermediate (3e-3, 5e-3) assignment was dead code (always
    # overwritten for bf16) and has been removed.
    if itype == torch.float32:
        rtol, atol = 3e-4, 1e-3
    else:
        rtol, atol = 1e-2, 6e-2
    seed_everything(0)
    x = torch.randn(batch, dim, seqlen, device=device, dtype=itype)
    x_tr = x.clone()
    conv_state = torch.randn(batch, dim, width - 1, device=device, dtype=itype)
    conv_tr = conv_state.clone()
    conv_ref = conv_state.clone()
    weight = torch.randn(dim, width, device=device, dtype=itype)
    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
    activation = None if not silu_activation else "silu"
    cidx = torch.arange(batch, dtype=torch.int32, device=device)

    out_ref = ref_causal_conv1d_update_single_token(
        x, conv_ref, weight, bias, activation, cidx, PAD_SLOT_ID
    )
    out_tr = causal_conv1d_update_single_token(
        x_tr,
        conv_tr,
        weight,
        bias,
        activation=activation,
        conv_state_indices=cidx,
        pad_slot_id=PAD_SLOT_ID,
    )
    # State update is deterministic value movement: require exact equality.
    torch.testing.assert_close(conv_tr, conv_ref, rtol=0.0, atol=0.0)
    torch.testing.assert_close(out_tr, out_ref, rtol=rtol, atol=atol)


@cuda_ok
@pytest.mark.parametrize(
    "num_k_heads,num_v_heads,head_k_dim,head_v_dim,num_tokens,num_actual_tokens,width",
    [
        (2, 2, 8, 8, 4, 2, 3),
        (2, 4, 16, 8, 6, 3, 4),
    ],
)
def test_fused_reshape_causal_conv1d_update_single_token_matches_ref(
    num_k_heads,
    num_v_heads,
    head_k_dim,
    head_v_dim,
    num_tokens,
    num_actual_tokens,
    width,
):
    """Fused reshape+conv update vs reference: out, b/a splits, z, core, state."""
    device = "cuda"
    torch.manual_seed(1)
    hv_ratio = num_v_heads // num_k_heads
    assert hv_ratio * num_k_heads == num_v_heads
    head_dim = head_k_dim + head_k_dim + head_v_dim * hv_ratio
    head_qkvz_dim = head_dim + head_v_dim * hv_ratio
    qkvz_dim = num_k_heads * head_qkvz_dim
    dim = num_k_heads * head_dim
    seqlen = 1
    dtype = torch.bfloat16
    rtol, atol = 1e-2, 6e-2

    x = torch.randn(num_tokens, qkvz_dim, seqlen, device=device, dtype=dtype)
    ba = torch.randn(num_tokens, 2 * num_v_heads, device=device, dtype=dtype)
    z_out = torch.zeros(num_tokens, num_v_heads, head_v_dim, device=device, dtype=dtype)
    core = torch.zeros_like(z_out)
    conv_state = torch.randn(
        num_actual_tokens, dim, width - 1, device=device, dtype=dtype
    )
    weight = torch.randn(dim, width, device=device, dtype=dtype)
    bias = torch.randn(dim, device=device, dtype=dtype)

    z_ref = z_out.clone()
    core_ref = core.clone()
    cs_ref_init = conv_state.clone()
    out_ref, b_ref, a_ref, z_r, c_r, cs_ref = (
        ref_fused_reshape_causal_conv1d_update_single_token(
            x,
            num_actual_tokens,
            num_k_heads,
            num_v_heads,
            head_k_dim,
            head_v_dim,
            ba,
            z_ref,
            core_ref,
            cs_ref_init,
            weight,
            bias,
            "silu",
            None,
            PAD_SLOT_ID,
        )
    )

    z_tr = z_out.clone()
    core_tr = core.clone()
    cs_tr = conv_state.clone()
    out_tr, b_tr, a_tr = fused_reshape_causal_conv1d_update_single_token(
        x,
        num_actual_tokens,
        num_k_heads,
        num_v_heads,
        head_k_dim,
        head_v_dim,
        ba,
        z_tr,
        core_tr,
        cs_tr,
        weight,
        bias,
        activation="silu",
        conv_state_indices=None,
        pad_slot_id=PAD_SLOT_ID,
    )

    torch.testing.assert_close(out_tr.float(), out_ref.float(), rtol=rtol, atol=atol)
    # b/a extraction and the z/core/state moves are pure copies: exact match.
    torch.testing.assert_close(b_tr, b_ref, rtol=0.0, atol=0.0)
    torch.testing.assert_close(a_tr, a_ref, rtol=0.0, atol=0.0)
    torch.testing.assert_close(z_tr, z_r, rtol=rtol, atol=atol)
    torch.testing.assert_close(core_tr, c_r, rtol=0.0, atol=0.0)
    torch.testing.assert_close(cs_tr, cs_ref, rtol=0.0, atol=0.0)
project +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. + +import pytest +import torch + +from aiter.ops.triton.gated_delta_net import fused_rearrange_sigmoid_gated_delta_rule + +cuda_ok = pytest.mark.skipif( + not torch.cuda.is_available(), reason="CUDA/HIP device required" +) + + +def _softplus(x: torch.Tensor, beta: float, threshold: float) -> torch.Tensor: + return torch.where( + beta * x <= threshold, + (1.0 / beta) * torch.log1p(torch.exp(beta * x)), + x, + ) + + +def ref_fused_rearrange_sigmoid_gdr( + A_log: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + dt_bias: torch.Tensor, + qkv: torch.Tensor, + key_dim: int, + value_dim: int, + head_k_dim: int, + head_v_dim: int, + beta: float, + threshold: float, + scale: float, + initial_state: torch.Tensor | None, + use_qk_l2norm_in_kernel: bool, +) -> tuple[torch.Tensor, torch.Tensor]: + """Float reference for decode path (B=1, one sequence), including GQA (HV >= H).""" + T = qkv.shape[0] + H = key_dim // head_k_dim + HV = value_dim // head_v_dim + K = head_k_dim + V = head_v_dim + if HV % H != 0: + raise ValueError(f"reference expects HV divisible by H, got H={H}, HV={HV}") + group = HV // H + B = 1 + o = torch.empty(B, T, HV, V, dtype=torch.float32, device=qkv.device) + h_state = torch.zeros(HV, V, K, dtype=torch.float32, device=qkv.device) + if initial_state is not None: + h_state = initial_state[0].to(torch.float32).clone() + + for t in range(T): + row = qkv[t] + for hv in range(HV): + i_h = hv // group + q_vec = row[i_h * K : (i_h + 1) * K].float() + k_vec = row[H * K + i_h * K : H * K + (i_h + 1) * K].float() + v_vec = row[2 * H * K + hv * V : 2 * H * K + (hv + 1) * V].float() + b_gate = b[t, hv].float() + x = a[t, hv].float() + dt_bias[hv].float() + sp = _softplus(x, beta, threshold) + g = -torch.exp(A_log[hv].float()) * sp + beta_out = torch.sigmoid(b_gate) + if use_qk_l2norm_in_kernel: + q_vec = q_vec * torch.rsqrt((q_vec * q_vec).sum() + 1e-6) + k_vec = k_vec * 
torch.rsqrt((k_vec * k_vec).sum() + 1e-6) + q_vec = q_vec * scale + h_sub = h_state[hv] + h_sub = h_sub * torch.exp(g) + v_adj = v_vec - (h_sub * k_vec.unsqueeze(0)).sum(dim=-1) + v_adj = v_adj * beta_out + h_sub = h_sub + v_adj.unsqueeze(-1) * k_vec.unsqueeze(0) + out_vec = (h_sub * q_vec.unsqueeze(0)).sum(dim=-1) + o[0, t, hv] = out_vec + h_state[hv] = h_sub + return o, h_state.unsqueeze(0) + + +# Shapes aligned with ``test_gated_delta_rule.test_fused_recurrent``; dtypes are +# half-precision only — long packed ``T`` with float32 activations tends to blow +# up the recurrent reference / kernel without tighter dynamic-range clamps. +# Each row ends with ``use_qk_l2norm_in_kernel`` (True for stable long-T sweep). +# One small bf16 row uses False to cover the no–L2-norm path (replaces former ``basic``). +_FUSED_GDR_SWEEP = [ + (63, 1, 1, 64, 1, 1, torch.float16, True), + (500, 4, 4, 60, 1, 1, torch.float16, True), + (1000, 2, 8, 128, 1, 0.1, torch.float16, True), + (1024, 2, 2, 128, 0.1, 1, torch.float16, True), + (1024, 3, 3, 128, 1, 10, torch.float16, True), + (2048, 4, 4, 64, 0.1, 1, torch.float16, True), + (1024, 4, 4, 128, 1, 0.1, torch.float16, True), + (1024, 4, 8, 128, 1, 10, torch.float16, True), + (1024, 4, 4, 128, 1, 0.1, torch.bfloat16, True), + (1024, 4, 8, 128, 1, 1, torch.bfloat16, True), + (2048, 4, 8, 64, 0.1, 1, torch.bfloat16, True), + (8, 4, 4, 16, 16**-0.5, 1, torch.bfloat16, False), +] + + +@cuda_ok +@pytest.mark.parametrize( + ( + "T", + "H", + "HV", + "D", + "scale", + "gate_logit_normalizer", + "dtype", + "use_qk_l2norm_in_kernel", + ), + [ + pytest.param( + *row, + id="T{}-H{}-HV{}-D{}-scale{}-gate_logit_normalizer{}-{}-l2{}".format(*row), + ) + for row in _FUSED_GDR_SWEEP + ], +) +def test_fused_rearrange_sigmoid_gdr_sweep( + T: int, + H: int, + HV: int, + D: int, + scale: float, + gate_logit_normalizer: float, + dtype: torch.dtype, + use_qk_l2norm_in_kernel: bool, +): + """Shape/dtype sweep aligned with 
``test_gated_delta_rule.test_fused_recurrent``.""" + if HV % H != 0: + pytest.skip("reference/kernel GQA mapping needs HV divisible by H") + device = "cuda" + K = V = D + key_dim = H * K + value_dim = HV * V + + if use_qk_l2norm_in_kernel: + torch.manual_seed(42) + qkv = torch.randn(T, key_dim * 2 + value_dim, device=device, dtype=dtype) * 0.05 + A_log = ( + torch.randn(HV, device=device, dtype=torch.float32).clamp(-2.0, 0.5) * 0.02 + ) + a = (torch.randn(T, HV, device=device, dtype=dtype) * 0.05).clamp(-1.0, 1.0) + a = a / gate_logit_normalizer + b_gate = (torch.randn(T, HV, device=device, dtype=dtype) * 0.05).clamp( + -1.0, 1.0 + ) + dt_bias = (torch.randn(HV, device=device, dtype=dtype) * 0.005).clamp(-0.5, 0.5) + initial = torch.randn(1, HV, V, K, device=device, dtype=dtype) * 0.05 + else: + torch.manual_seed(0) + qkv = torch.randn(T, key_dim * 2 + value_dim, device=device, dtype=dtype) + A_log = torch.randn(HV, device=device, dtype=torch.float32) * 0.02 + a = torch.randn(T, HV, device=device, dtype=dtype) * 0.1 + a = a / gate_logit_normalizer + b_gate = torch.randn(T, HV, device=device, dtype=dtype) * 0.1 + dt_bias = torch.randn(HV, device=device, dtype=dtype) * 0.01 + initial = torch.randn(1, HV, V, K, device=device, dtype=dtype) + + o_ref, h_ref = ref_fused_rearrange_sigmoid_gdr( + A_log, + a, + b_gate, + dt_bias, + qkv, + key_dim, + value_dim, + K, + V, + 1.0, + 20.0, + scale, + initial, + use_qk_l2norm_in_kernel, + ) + + core = torch.empty(1 * 1 * T * HV * V, device=device, dtype=dtype) + o_tr, h_tr = fused_rearrange_sigmoid_gated_delta_rule( + A_log, + a, + b_gate, + dt_bias, + qkv, + key_dim, + value_dim, + K, + V, + beta=1.0, + threshold=20.0, + scale=scale, + initial_state=initial, + inplace_final_state=False, + cu_seqlens=None, + ssm_state_indices=None, + num_accepted_tokens=None, + use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel, + is_kda=False, + core_attn_out=core, + ) + + if dtype == torch.bfloat16: + rtol, atol = 0.05, 0.1 + elif dtype == 
torch.float16: + rtol, atol = 0.03, 0.08 + else: + rtol, atol = 0.02, 0.05 + + if use_qk_l2norm_in_kernel: + assert torch.isfinite(o_tr.float()).all(), "non-finite Triton output" + assert torch.isfinite(h_tr.float()).all(), "non-finite Triton final_state" + torch.testing.assert_close(o_tr.float(), o_ref, rtol=rtol, atol=atol) + torch.testing.assert_close(h_tr[-1].float(), h_ref[0], rtol=rtol, atol=atol)