From d33ef7071fdcf0e7859a0143a1ae47272cbcea4e Mon Sep 17 00:00:00 2001 From: minmengdie Date: Wed, 19 Nov 2025 09:44:19 +0000 Subject: [PATCH 01/10] fix output/lse is nan when kseq=0 --- .../fmha_v3_fwd/fwd_hd192_hd128_bf16.co | Bin 40800 -> 40824 bytes .../fwd_hd192_hd128_bf16_causal.co | Bin 46736 -> 46760 bytes .../fwd_hd192_hd128_bf16_causal_group.co | Bin 46912 -> 46936 bytes .../fmha_v3_fwd/fwd_hd192_hd128_bf16_group.co | Bin 40928 -> 40952 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16.co index 9890d4dfa6fa3dc7f6fd9ca75232cf07cfe735fb..482d44bd2122541ccd6670eb6f2a7bd15c188626 100755 GIT binary patch delta 656 zcmaE`kLkxgrU@F1KPGA}cho3l00S7!AOPVr0x32i)|dg|YwSR!3m{yE8IHxF@Q(1yuY-D40rlFBBoN49;XF8gjLJc!?F|lBXkMc83 z%q&SQGBQou+&9~Z3FwrAbM-ml$zt-)x#og1pb3ZcN8dP00S7!AOPVr0x32i7MKCy3+zCp3m{yE50eEM#RV}` zO}1oISHuv7ngQ3lxsg#`ee(*9n0yu{hSvR?C1wOL)(5ykjfPS@V49twU@EG(I7~br zRa^-s?uja{3l+Z*iYjga6VCuM>fy4^Fo|lo5SZZy6YoP6kA#Y!nS&~x0u>inf+`Mi zH#-Bz3S@C^28NR$J`)Fnz~sVN+v*KWEKL}45|dJM;tee(8X3cxW^Qn%qq#B6TxS!wsEdgaLwuBmSkcQD4iTQ zQ*`qCxssgl)G%3ap1Dv1G+rBKKs?gGKKb@s*~#VeBpDkfPn>7ZXfXLAkeo4DaK1ew J$K=5I>HwS1YR&)v diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16_causal.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16_causal.co index e3a45201f49da7d418bd192f53f85c1fdcceb2db..450955a3fba1471e24de8ffcb62703614a68a9b3 100755 GIT binary patch delta 712 zcmbR6mTARXrU@F18WS~_J1*#E00S7!AOPVr0x32iUa$_rUvLhUE`V?u5+(~WiVI?> znrz9au81KDH3P19b0gz_bq=)#?)?%9t@}4OX!^8>urajm_h4|l@cu9}!+QpXix2NB zGfuv+b`9gF&9&<$GuFS@3AGtY>4Ir?h72!MaSN!p$7)n@XQ=p#8dPyVsCWm20hf$~ zic7?U8TD|96sUwp8C(d=$cKto^rDJaLB&@rK^1R>iZ9uODn0=!{$Ve&_+-9~T#_~W zkcAi+mO+(Hm~6OFoN>Wq&yCmXjSQU}8FCVnQgh-BOkH41Lsvs1sHmZlIZV{R%oQeT z;ACJ1Q()i%6E!rmbcKmJo58dhy180HMU9M{oZ(CtH-;Qeu+2tBt|pWBD8_AO+x&_V z=&+Al^f}=vX|m>4b3uXg5U&WVhtMA;7jBWCJYlN@lfe4PtG3!P9+>=ct36Pw<~Dmy N0ca43PcGal4*-jCi)sJ> delta 693 zcmZ4SmT3YIX)p>*)Lib^(ait`Fq%OC!e<0hY(U(x4#Mv^he{VfxC|AO1sTN!F;q>q zWK>tg5QUln*Sooq@xS`!4o#mH7AA(){gYp;UBmce^VD^d8S4|gpcX(WT`H~D;@k`jv!N0xlMOeDOIGYdmS$k!14R%M2Sdl?#EsYM4NP4u z7;+MmQgh-BT@4LjOe1rcsDYU)Ow_>1zyzkizy&61XlCgQ6LmI&X)|)*fE)Nt3D?@$xL?KYA*Bx8u>rgK|I32IGKB^ n?BwNJC7FJ#n|x@i9pj40jN9yidL6gfbN$!|Q6&ym#mE2vdaQjh diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16_causal_group.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16_causal_group.co index ec05c4ed25b7c924dac087d852f5233ebf580688..fbfc6ffeb1a22c9ce307e1fefe55e2f436fdb70c 100755 GIT binary patch delta 736 zcmX@`j_JlbrU@F1HzsN>cU0(M00S7!AOPVr0x32iR#*?=E1XBA3m{yE2a^RE#RV}` zO}1oISHuv7ngQ3lxsh?ZGGqB>OV#5n9BK{R`y~`w_ixhjX%S&#Xx;C@;CA8tVP=N+ z3=9_^-dARv9I$Q;fB;xppGjC#0iK2+jF8C(d=sDg_B=tUK8g^Ft|MHQa_6<67eDn1)3 zUa${YoST7R8C2rWozMfF)B=6xK*DMp4=wi+-fd30h(|otcUooV6x&i cbH)jiBe&TzI!vAkBrPW21d=NzOKw*O0A<6B6#xJL delta 742 zcmccdj_JTVrU@F12PSGRcjV|{00S7!AOPVr0x32i=2#EmbDT$|3m{yEACmn<6T3DDETK7-RSht4p$7a#>)r^b`lP7M_ zuYceLwGB#HfN6Gy1FKQRouT3hwW#8LP;mx-RPjit_=I>=@f4`|fif_o9xj^?mH5yL z7XmY?pyCQkQN>%K;xe02#V0_;6Lz7B&xVRe>_Zlxyl*3y;S&@g28NALr9aTbRY5Vw z#KFKZ`RB&-^#%+%iAkwB@dgHN7BHqMoN4H43{zleWC>SbZU7TCa56H1YBMx&gNYiN zIXS~boh@M6j0_CSp`u2HE)Y@KoOmN67c-c!k((ug>B=zqoPyEh?~3M|&u)Ir$i&ez zd84Ap_f)&tYQ?VWrC>>CBT$97MNitSUF5G6%s4;mXkW`rb5lBv$ IY`9$=0Jg7*@Bjb+ diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16_group.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16_group.co index b2c6afde13876d6b9d029cada7fd3b4aefccaa5b..794fd00158131ef67e9b9c08e9549e4ea7f0c92d 100755 GIT binary patch delta 680 zcmaE`pXtYbrU@F16%#d=J1!_?00S7!AOPVr0x32iUN8f~U$6s}E`V?u5+(~WiVI?> znrz9au81KDH3P19b0ed^GULz98&%R-IMf=r_ur9f-M?8vGbUezjiGhF2ZP&%_lKDo z-ZL;1b)p5Fh1dnwVLVT4ZdPmYb0{`Gca&RpeCp@uCuAFBsB(M|W ueu0?~`UBhKL-Wj-1ZGbDHP4Rmz+}t$_KX^nE9cvD&X^5Rac}a$x$*!I1c8eH delta 637 zcmeydpXtGVrU@F12@^G!J9d;ZfB}qV5P1b&PGuPS7XtUrPcP6He(#eL3!jp67Npr%S)G>MD zJaeHR&`A6-1L8yu_Q}!nWG8>0C&>vDS`XH$Ghc;q#pJ;G_KX6PCj!Za$rtCV0{~O) BaxMS> From dcae84be08aef25e6db477c9af658068540417c3 Mon Sep 17 00:00:00 2001 From: minmengdie Date: Thu, 20 Nov 2025 08:39:39 +0000 Subject: [PATCH 02/10] fix gfx950 128_128 fwd_v3 --- hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16.co | Bin 27192 -> 27216 bytes .../fmha_v3_fwd/fwd_hd128_bf16_causal.co | Bin 30480 -> 30504 bytes .../fwd_hd128_bf16_causal_group.co | Bin 30656 -> 30680 bytes .../fmha_v3_fwd/fwd_hd128_bf16_group.co | Bin 27320 -> 27344 bytes op_tests/cpp/mha/smoke_test_fwd_v3.sh | 2 ++ 5 files changed, 2 insertions(+) diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16.co index 460d4d2ec22eed7c20a571324862332b8fb5e134..6e9af0622050b005da422280422b720165aa7bfd 100755 GIT binary patch delta 689 zcmdmSh4I1_#t9mX7ba>hcYI;b00uCcK>)&M1X653{2~d$f6;_W7eKfS5|afP#RV}` zO}1oISHuv7ngQ3lxsh?T8V7p=_x_cuRN_GqvIGOe2@scw zgW<(w!_@8du5M-wIf+TBIq?QYF0L@9iM#dMDXJ+Vg!c)%Viy7uZ5zP>%M?n3& of@kvV4B5%%nUahVlLIsDnK)7=Ps+4o?3jEp)1L9iWWg+T089^e{Qv*} delta 645 zcmca`g>lCf#t9mXJ0@x_cf4TF00uCcK>)&M1X653d?5+KztDtA7eKfS29pID#RV}` zO}1oISHuv7ngQ3lxsh?T+GZ1t305pj46XY&-$)8ztS_*G8V#j*z%)CUqTouwj7X^Xmt<7&6sY)- zY*g`lsQ86EWN~f=hAODUiOGhk;*1w2d!}x$H*#@i$Vp5}&51W~F|&X%EgTJDOk-z9 z7}M0q1jaPCaD_1~9F1X2OBW{?)7jVs#&k7?8|LZ;H^I%#h#@{|bA6g26VNFeGxRy( zNn!HK40Az;W{B$@puRpZ*)Y?b(P46CrajY#rQ0tlDk2b6~CV*>Jt z(FfH(*`HBRiLf1;n;4taIoKPx_Xi8K?%&L!Y2hlu#?ZQ7gTd_rlQ|Ou(AJC6tcw^X zJLD~4yuSHq-eg9{h$@)VAha%+VrM9EL>0GyiffdhiaSqcEEKQzgUTAXqRK`>#rHtN z4IIFg;=L^7&)E11h}#83f^0GRXyhy;UAIZOmX&xVR8 zOl~X`mwZu&D!UOX`(yIPLUAUJ{>i5buhknGx|lKKBqpWi#2Xk}xYw%O3g4Q3sX={k9h zV$kN+;@6B!96pmjmdH(>P%6L$Pki~4SCyIxRzTAtEND6=e=Id;teC7>X3wZGITJ`K OOkN2jCro}>rVaq5Lw;BQ delta 753 zcmZ4Sj&TAIX)sQhsJXoUgEs>hz-R^m2%iy1u>tXiJP7|oA1Yk{;W7w-6oLR$9}|#C zj6SIT$^MLjN`&p$+{D_Y{h`c3?cQ^mdo6K0B;|R7MPUwPJ>xLoQ(}&OgBSk zDAN!K;7k`27}Lnbh#^M=VzZGOLfFk>@*2gU&2x)iGctYfo@`hkI(dGnBo{o{?5iFOy_UnCw_)&nPgt5lC`OJ_sZ$CNq|+0|3YHgR=kt diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal_group.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal_group.co index 35a76e1b82b6e3d5cc3c7ffa59d0bbb954c97e08..b281ebcd087ccc5f8b456b9e836c133f9434d155 100755 GIT binary patch delta 773 zcmX@`p7F+e#t9mX5fe3+J9hXmfB}qV5P8F0Ou8yUANGfv!Wsp`$b!QQ~VKUko3|K=7=3s(^~hSvQW3~m>g%$XRN z7#J=}vo2zod?0TL(Bd=jx3N z7;+MmQgh-B4BX72OhZFAGnlBclM77Lz|0XQ>S*BxW12WR!HZ*dxfH92?Vb&QM8@f*3qZqf@q~r}FW5;C0GJQ^Xnw%V2 qW-j;xnz~>?$1!dmsbN7KTUg^8hc|KuBaOBnBNuFbDzoa|8~FqxxJ zq5h2{R1cK00MqOY3rbMMouT3;v8dvHQ1KP6sN#_@^}eX$DNyk*5nx6=Tow`>>$KS09~CNTjjexw{G0-_VfB}qV5P1SCinD~Fid#U% zS2%$g^>A5dsDywATnNnYgNpA6L=}&Oiu0gZkOCEd6NRcC;vaT~5{Lq*q%s3T6;y*u z4yuMLATARJL&D^hsXOb<7;+MmQgh-B+zd@%Od~gE7}LeV0>-p(HiR*aT^(UeQxmv0 zb4OR0sD-mJjA`lS1YJ9;=UfB}qV5PBu zB@U+986+}M#g(Aqe?n2kb)n)5oKVFrpyC`JsN&90@eP4sMm=2C4=TYD1{VS|BBA0> zqEN+CpyD|Y23#^9DsGbl7XmY?pyD171yD(D28JsjCKCsP$K->lJL?S?auSnLbK(t* z+#F#{7Yj2O)56IO#x!=dfH6%?j9^T2M`sw*!r2hUv~;tCF`Z4FU`$sNxM6OFMhx*$ zex{R;DkyH2N*7~d^q6dzsm}>d6_YbF%>{Qr<9A0g#Q7eRH)fhM?wI^C)1EP7vSyY& W(~H!}Sy^_R5or(wcPDSmkOu%P5o)9W diff --git a/op_tests/cpp/mha/smoke_test_fwd_v3.sh b/op_tests/cpp/mha/smoke_test_fwd_v3.sh index 23a980d35e..1c74659fb3 100644 --- a/op_tests/cpp/mha/smoke_test_fwd_v3.sh +++ b/op_tests/cpp/mha/smoke_test_fwd_v3.sh @@ -27,6 +27,7 @@ run_gfx950_fwd_v3() { for seqlen_k in 512 700 1023 1058; do $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=$head_dim -d_v=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS + $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=$head_dim -d_v=128 -s=$seqlen_q,$seqlen_k -s_k=$seqlen_k,0 -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=3 -h_k=1 -d=$head_dim -d_v=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=1 -h_k=1 -d=$head_dim -d_v=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS @@ -58,6 +59,7 @@ run_gfx942_fwd_v3() { for v3_bf16_cvt in 0 1 2; do $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS + $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=128 -s=$seqlen_q,$seqlen_k -s_k=$seqlen_k,0 -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=3 -h_k=1 -d=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=1 -h_k=1 -d=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS From 4018e0513fc46834a5821fe2b038599711e3a868 Mon Sep 17 00:00:00 2001 From: minmengdie Date: Thu, 20 Nov 2025 09:43:41 +0000 Subject: [PATCH 03/10] update the k_seq=0 error in MI300 and MI308 --- .../MI300/fwd_hd128_bf16_causal_rtna.co | Bin 29616 -> 29792 bytes .../MI300/fwd_hd128_bf16_causal_rtna_group.co | Bin 29792 -> 29968 bytes .../MI300/fwd_hd128_bf16_causal_rtne.co | Bin 31216 -> 31392 bytes .../MI300/fwd_hd128_bf16_causal_rtne_group.co | Bin 31392 -> 31568 bytes .../MI300/fwd_hd128_bf16_causal_rtz.co | Bin 25776 -> 25952 bytes .../MI300/fwd_hd128_bf16_causal_rtz_group.co | Bin 25952 -> 26128 bytes .../fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna.co | Bin 27624 -> 27800 bytes .../MI300/fwd_hd128_bf16_rtna_group.co | Bin 27752 -> 27928 bytes .../fmha_v3_fwd/MI300/fwd_hd128_bf16_rtne.co | Bin 29224 -> 29400 bytes .../MI300/fwd_hd128_bf16_rtne_group.co | Bin 29352 -> 29528 bytes .../fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co | Bin 23776 -> 23952 bytes .../MI300/fwd_hd128_bf16_rtz_group.co | Bin 23912 -> 24088 bytes .../MI308/fwd_hd128_bf16_causal_rtna.co | Bin 28952 -> 29128 bytes .../MI308/fwd_hd128_bf16_causal_rtna_group.co | Bin 29128 -> 29304 bytes .../MI308/fwd_hd128_bf16_causal_rtne.co | Bin 30552 -> 30728 bytes .../MI308/fwd_hd128_bf16_causal_rtne_group.co | Bin 30728 -> 30904 bytes .../MI308/fwd_hd128_bf16_causal_rtz.co | Bin 25112 -> 25288 bytes .../MI308/fwd_hd128_bf16_causal_rtz_group.co | Bin 25288 -> 25464 bytes .../fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co | Bin 26896 -> 27072 bytes .../MI308/fwd_hd128_bf16_rtna_group.co | Bin 27024 -> 27200 bytes .../fmha_v3_fwd/MI308/fwd_hd128_bf16_rtne.co | Bin 28496 -> 28672 bytes .../MI308/fwd_hd128_bf16_rtne_group.co | Bin 28624 -> 28800 bytes .../fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz.co | Bin 23048 -> 23224 bytes .../MI308/fwd_hd128_bf16_rtz_group.co | Bin 23184 -> 23360 bytes 24 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co index af10ab6df58f9f93bd944e230221f4874b055bf7..146a54651588c2de718355c3dad699132f06f1ed 100755 GIT binary patch delta 968 zcmZXT&5P4O7{+J9+FC!L#f72>?H<$?T$`jH$qH&~x5X~vLG&Oi%qDHp)~1Uqlk0N+*VGn!o(3^sL=$~L{W&&X|4A1lao;SnHgna%3Cx2n2 zU%6T3AS})eBMCSerazVrZh+at6hyrmx)O5dwByy&mqE3>5=2Q1#$$)N$P&D;aEQ`1 z{C7vsU^sO&{Pg(v>&d}*^gR8hG#>(kaSBv_ao0G&0kE4m@*a?6XB+O2ADzv<$X{|t zGLYttn^##p!BX5XBq}4E>l27rhNWNke4anD+c*3%D**( z{2t|Z?#>L^>_++x%J!}Ydokr-YC-;p^6Bj$A5ebZTiXv`Q@+3IqnUZ@aS_aE!Qg`z zfU`!B?@(SO(2Sq{`8b$Dj7}@}*_gC{@eM%1aWnrEG!OvMexL$*~4O6nIMA&(AjCT8T^+F1nK+owNQyctB!y vCgT4RGIyRzfLVFwM@uV~Z+o0Tn;vfGX|`72lAKD((jr_X$N6 zkA#Y!aRM{y;j$@Ei8Y>ZAuuB!D*hq}RlEu+9)e~;D^&bKG^+XuQ1L$yi=dL$3=FfO z5+XUM8kRxD6JP-gmEH&y*O%j}z?jbFZZM{+2}pq^ z*k(6V3#hQ6kpavILn9*>xTvw=#Y1Q_!UoaB|{M70ttoXAV)m z2VdLYfbY)1&;7acU&f;2kB$!B9P14F(V1=aWCU~uQ=q&j%nLvO;9LC2OF*Bmy@EZu zgrBsf)Mf8ZOeL{p)^!^Z&25-kOGl1{+=k{No3tEM)lB596Jpt}S5aNl>P@rm`8ulU zmGwGmnudi8%SIZ;$Z`;|Eu)EbC+XPLB*ynzmppPuuezfoVPsN@Y~(5#DXthPJ!fR) ztX|Qoil%6?l&NVIQA*cRIenVGfT!toJ3@bB3D&6CO1Ae2`)U@HQ5*-GvteFfe0wg; zuQ9%&hWTyAl}4D?82^5AB-(*%F=NH=6!80uXRZclBsz?f8)4pM{DTEiYv)VBYIHj z0tlC3#biN7aX}1KlPww56){AiX2A7sZe-l8%y@XSt?Fr(%>r6e99ftcTK7-hk-dU( z`{cx&el|l3Jp(<1$qN~!CKa$*sQ8&sRPhw3xDUiYgzfoI2^}v~4OLL_j9^sp*2#@|;`I}tvH>xuva_M$ zHL0lL%b?;@ppgNS-3S$b01Hv5#2%>lifWij2z?wP&ct9cIWb>c(xnDf+65HpOdJdm zlPBh%XEdIiSRh<)=xWH2lbDp66K`PTXbfW-Tf&(xCeAQX3n#d!i6NY6>g)tpVCV>E zI+?jdkHzMkFrN5i!xD3*2RW0oO6(XFCa)~9XPS^d`BjM; E06YbcdH?_b diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co index e4a46bd7257796e2da2a183f9f0704392c38fd42..7d95f3c2808bd9df84d9ff5b117ce749f7a95955 100755 GIT binary patch delta 969 zcmZXTPiWIn9LL`a+8P{UaZnVY?P;BK_BCmnC^{GWM=0VS7=j0ROWUODOq(uAx7JIi zSa4o+K5u%c9*Q>^y?C-m@#F>tuR97}1o3W2UIJlxJbs_w=lkB{5qN+8!qGp-4obJn z41_Bu_ShH`-`=}4mQr_t^LZ7Vb$e)+kUf#%_UKKpcH9;mkO+)Mj;$hB;6mbto< zrnei4fjhFb-j-VmHg#;R%N8~|Efd$|Ha5GRj&4}?irQnj(QG&DT)}mv+HK%wTh+0u z8(7vfted!F=xS3_%(Q9L(wer_TC&ZCQnQ>Uu4a)asQGG^i-{^y@@h`VDOI^9%AzcA z*}7cKaC{x*l|}LaULZ%?A@bFz43ZZ8Nwh3EH4Nsi2E9oA{PmzOP(ND<`a9H%8$th& z`nL}pAU{lYP^WHoGq_f#ezg+xPpC($K|iGay?eGFzNY@mBVRf2&h;F4-UtR`Hvre% z8~EWq^*cneqUYzH`GEu9XnK)sT2Y9|uJsdTP*wt(TvarTG6mj4PRx183qBSLC9>Z= zr}AZA<%&Lv*@Cx6ED0V;Qpv+2m+`QS$`muagnaZbkl_mgIZCALi6+X{ZwOQ5elN}X tFNo~+gvf}#g%N#UYviyeuwM6>MEU}A&m@)pGSlmlm;Gh+vL1ruX+|MsFhGk7fiD=@Yte?TR_FnIG~CgD&gY=7XmZ#q2d}LsNz*n@hNB)v_i!<#GJqGF?558x*51ICO1}^bHaGylMhy!Gc72Z{HxNA=|st7t13H2i^-K$>HvTig`ofd diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne_group.co index 4b8000efeb19d93c58481a201c9d1f4139d5eae0..d59522e81e0a98524a0635ce57105890eb81b291 100755 GIT binary patch delta 1034 zcmZXT&ui0Q7{}i?)UHL46$V2Q+A8WMXp{6u(m^*%e+VKfSdd+ON!z5|Oq(uAi|wVG z(1S2=^Yx|&6`4B>tn?x{D4slcu)@w`|AGevZ^q=!t8oe8`+Yv&C-3v}lDzx@CqH4Z zU$|LhAiU<=4u;`md;V>9?>4Y6SHZMXLreVDGt`}W_|&T{rv!I!2+oF%b>TF;6g@`C z8uZ=K2K;y)E{`%7zW4%D>nF!Mrv~Fu;PS!s89x||=Rk47EHQuq;9K~_Yk-f|c3~7h z)}Qow_NsM9=3=VWtjRiRNoKt%HDqLH$gE2y(pwD!Riq{|+O3wRn^s5VfmB;*)~sAc zRk_@*p_QhhAw|=Xq^d|WP)pa86;(E3hF*!OYNv6RDQk~4|&9^nT!D6-9f@L|czw+KJ$44~n2!o~X( zS>`SPJ0$Sc{|x|mPx!vGGaBv_4jxfto6m%A;_v3{=@F_qSKDZN{3M&CwJ@q5uE@ delta 838 zcmcccjd8(OM$URyA2%jOMhE7Jobrqc6E&ARPVr{|0~pO90O2zNDK;RUQ3T=7n1o6f zK)4JUlLZ;Y1u;}jwq#US#1Ms=0oS{^k#V;&WAJ8M)zd7S1+=cXvoJBV?w|ala0R3K zH71VwxWdWwy8G39{#hszzK8~p3 zeo*lZS*YTXP;s7cRPhw3xDUiYgzfoIi8EfP8mge;D?(7kTPGhZ5wD*Bl{JV%m7NV0 z|C5d?z6>h91!5gsawAlHK?z(4%-91Jf6<64ejFms#85JMVX3%eOB1TJ3nk9Tk9cc?u%#3~dS7Yu0e**^{7}?g%0z3S-e@EmD9NmyS`ohyU#4 z1)NWwoPWMB-C7t;CofZP_{9(yO_QMhi@DAK27vwei4TAz8oO|V>@~KBJbT%`6Unr0 z^jc~knRw9d;jW4-0}a}Efcj?FLNe|lYtuB1{=ja?k8rEgYuUMq6jj=6p-xXTkY@A| z)^%i9$m|Ktq!4+wwmNLL=trWm?C3l=eI>|6$``K&d4=+WVvygW{2LDP2bABv z;~28N9v&)`9h8GzgYxW+Ab(8x>~@fkC~vrX`{8TK@89#0W8S))2XiPGd~^fw&OMnQ zexI sWJMOD&8P1BHP>9-A!Atxd(&sKFAGd$ot#pxF=CUf+Opduuapb_0TqA9!2kdN delta 810 zcmaEGigCk9M$URyA2%jOMhE7JobrqY6E&ARUa@2V0~pO90O2zNDK;R!5d-1hs6nL* zAY2BG$%2gHf*7hMTQaIEVu(V`fa~4d$jGj-d4lE?BNirx*8P($VpcGgPktEF&t_<$ zXP{>=c_E|J3n759UR`vjqi zM?%HVIDr}UaM={7gpLPX2+YWbioftj6|aJdhoD)|3Kb6sM^!%oD*h)9S=^d|VK!7^ z3d99)*=11igmkzNn6VKmt}%IIytw3+98}rkP}w7sAI6I_-kAI|{zko_vmrxHVp3{O zyn&I0F^p+!4re+!I>SWGUErd|Zg8fli4$Cbu_K)6VhUqg8k)eEj&5!+rn4DHfhO2y z7efn}u&W8&2v;)~xTv||xF{wGi)m#6svD llO0pcIbl5U$&D%IObM}*ccs`db;M8p1*Cq&Pqs=`0|4aIkR<>B diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz_group.co index fbd5eee3089f8e5c2d33d1bec1576f5c2668a380..7b08aa9b137d9a9ddbdc3e4495e5fb13504d8d71 100755 GIT binary patch delta 1021 zcmZXT&ui0Q7{}i?%&uh@i^5QZR))F_v`zXW$zX16m!dFa=s<*ZuWx>J)wGFiw}T*c zU~hsQ?`|?s55t?3dQxF}5>#-42M;^coBzTtVoct=7?%*f-{j(g5!P-U5U7tQ}Ju(!GG$Pphx0xJbt8$$ne71BQ$tM zaPMFfzMX)v!R)!uBhk@^V|yE8-Mv9{dRsUd0o}bxP~PX}IKTn$EphAxAkS7_!h7Uu z3RBMp(h}v{ zvjOq-8s*D{Fuz5)q=vap`S9EwM!LmPz@%xm&zQT}ax|BOhVcw(sz#qWErTrypjp5P{;)~BQskpDFGGBjAnUebNL0&F^c`AG+j2qA;~SlOirqdEmqh>*7+?RD$a zMHC_kFF`@aq=fnt?AopWAa!RZh8C9h-tW!VvQz8g-WxWT}8@rt2to)UUW-O&iA z7t!#c5n1#a_k*ebX0m#nIpgD@UZN&gvCaB#6a?}7%s(Y-u(|CDezIdjJ2d-lY+D@H z9;1M&FYvhC#K)jyFHNHCbUWQ}izlyC6Wr%KI0;`lSNWWrnj8Yj!(;?qgsA8-N<10n zeT}%g$om%Y?gsBF@ua@s-|h#bakt7(I3d2f&U+Oqd0V+gx}TeT_lEdsp7#dvagFx} z;=LW-pNTh1yua!$ATKBd+eB=iZ$}0{T^L0fs24u41d1|N7ULo+DMhW6cYDGTO{E}9 zBhe_;K!T}PWU0>|VFP?&mL_~O4N6{`M#Kb7qw-+D!f@x9BvX$2*%EO%#xy8GswlQo zMZ190aHP7-?7~7z4Olzm>UFaEC(H1o2F$dwfv5$9UKaMWB|Hr+ZCR)l;9Hyf1!mQZ Ai2wiq diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna.co index 092e402ad1e1ef8556337218c183630f91c535f3..9c34820759cc211abac2b3f4e18c6e45c829f54b 100755 GIT binary patch delta 954 zcmZXT&ui0A9KhcTy0*H5bHbdU?L=AqF`H&tlET>fV=4FpsR+Ule7v?VZPT;~N#?;z zEm-il@h&@f(Ss+!K?OYs#s9#aoZz81clAFoZC)aA3E}(se!k!L4eya0|A6~HVWOu# zD02|zPwgd=+{M=3&G&_I0ZgwTf^&Wjy)ffFhMZG;U2 znGUjSQ)?N-$vAc`V;IjnE1uCw&7CGGP2?3>ld7VSR5YGQ+M2vZs<@_L1vXeGgoF6fKn4 zf?xTlf5`lsa@21#|J82@i~d&-1GoR1Z?gW^PbTm*><2GcVgK1bH=zOZpGeg2GJoVo z{qZE_;$Td_X`g#rB_0^KN({tjl~RC$sD@aU0xagl0A(c?;HoSKD5@b!YB{j7QVdX0 zg#ZicWbXX*&85N?(^+1Y=;6}!VWzpr!7;t7XAV8&d~e%W%^rR ir1^Fnex!-E%mIxS+Z*tJ_S>6@RMS6zKlGk{1%kgdA-a(O delta 763 zcmbPnlkvrMM$URyA2%jOMhE7JobpT=856aZIi7H200S7!AOPVr0x32iK9K_9pJ+j) z3m{wui^+nF;({2eCR;M9D`JR3&4BCO+{mb{zIlrVhYbr8L+k#@7g8p$8CvKW=ow62 z$S740asvYcJA;NL)G{c=1E$#-3~W%v#i8OC>`=v(pyD2hsN%X%ahX6=aSMpJ6T=k; zR0(H@1j7_JRB=D3_!bBQE*Uv_Vw!k8Bm~(R8p7ZbU`9Sv*MvA!@hYhJnp9NrR;c)o zOjPj+Q1JsLsNyd`TqX`Ch7*$u)3?_fS~xP~BqpWi#2XlyIK!E4CNQRxr4fv2Zeb2% z8k?KKm?o}prkSe?Ow`=M62`P}HGnZ4UE$_BJ30YHg~2wvIJrVq7#f%xOune7y7_*F zBopI_$v?C7IpK+HvSGHlkOjoU3>MIEIKegfcb4qr>DiK;Fd^~DJG0FvZ^-7D{3F|f Xkzul7jy-2VIwTlAPX3i8$H)KxA9#O! diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna_group.co index 48ac9e54a57f78356f8e1e62963dbbb314197573..acfb38006131c73c2e933c1978d55d7432f21e14 100755 GIT binary patch delta 989 zcmZXTPiWIn9LL`ab8U2oZA?Z5={5)L)Y&xslLUq{X*VyTlL^+-BW+*OHfa-*R)&}A zq4$k<=|KcVWH*&wyj46Eyttc)FpnNQdKJu?mq1v;!UCUO49>5=^b0qjW~t zJ9+^#*I;V&aowGX$DUsr@6Qd!qxFsW*3v8(jMHHE8?(&-27r^~g*Sk3_^{7&x4d^H zoiVMBjvZvH-9|@kW8_+>+fcj6vD+@HsU76@Y}<0Wo}*@{db6W@b&Tq`+S5_9qghC^ z9Hg2ivRq_4mew?}n{l05#x$R{H$BqAweB=o(ej)qXvJ!tPl_6ci&{Y_;Hp{^RZ$iA zd|j<(`COeV;tlc|ULn75yua6@UxJwUn~VOmkqs&!Ut54g+S({LLduVCP3N&BWhU4mrT+p7!G98*#S8!d delta 819 zcmbPni}A$`M$URyA2%jOMhE7JobpUBGA3#*b98WG00S7!AOPVr0x32ic1VTr9a>T8 z0tlDkz+^#2aX}1KlPww56){AiX2A7sZe-L~W>nm~QKgt=vyLW*4GR-P>;B0esT0@? zE%XfZ3??sRl$tytjicV86lx!o;sMj_3=1q##l@lGD{N53m7wAt_Nd~zQ1JtasNxn- z@hO2|Mi5-q87kr92p0k~{Gj4I?x^CCP;ni9RPmI_is|C@`B2#(VW_fIP;riURB=cE zvNNcpp^8s{sxQbw75@U_GI21#Vh<)e8^{o1W}o~&eNnxID??6VQff}TfuW-XjA>-% z2xA%>8o`)OP6jZhxuYqJX>4f>W11SkndSygFi~?ya~RXY4Q`v00oYt2kj>7{mM|4A zF3ywhD{2F={^p*{H;haUPLmtcL?`!WOLD@z;V^k)wz<#-XmZ++3h`C|*W~JK*~#KL slAJIhaj;gu9Hq$yIUJKKax6G2(jkfqpoVfzz7ACX0xJ3!Cd$YF00$q1ssI20 diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtne.co index a63a8c29409657ed855c3a85ad044a7ee10b310c..30dff18af46fc6fd43b6447737c6f126664cb725 100755 GIT binary patch delta 954 zcmZXTy>HV%7{>3c(j)>TN|D+kL0lx104i~u4?9xTrum{s2qdN=hU>+#leCU~u~P;Z zQjju0msIx`z7Wa51;|RXf88|m{W*5!D*y0(= zYD~@!Ucx)EaA^1Zuz0aI9DIzg7p_c$-Y^a-hr+4=1OSer=iUMOcdOr(rMun`;|bHY z>bipn?ldgi(vfSUP6Kz4Ls~AP{UsEyG5J?I3KL z$aWENY@=!FZo+l6glRr&EqP>1*E)U>jkKgHMz)%kqpBh4StFxl^eWa=tYSq@*Kjo{ zr)pAGzeV4|tKCu800@CEE!G|;7kmu?DaZ3U#?@t>-(dWBh389*A8I^T$Gi(_kNJfn zJMb;f^M{Nt-{bic#{WIyd5v-Nzxp=gjg66c>L-7C?$04Gk5Adi^%(!!;CY|%`4>F@ zHHtzQh!nOycgqC{n7FE&NJ^^3z{=_J#8$F_EvC|eRZ6+QrW7Tx8L1LjC3R{`g@EQV zYk@7Oqq?*HmrF{9;iN2O>CxiNfo&}aP^7O(A~Lxs^nfUr`|OwRvui}A=pMO7W9qL3=f_X^k83hrS`#AvggX+O%8% delta 763 zcmccdlySuqM$URyA2%jOMhE7JobpU7@+N96bL8+~00S7!AOPVr0x32i=E#QdIl58l z0tlDk$7Df9aX}1KlPww56){AiX2A7sZe-L}-@HY`!-<87p>_Xcf$Rxvh8B7TdIpmh zGD_8h+`z!V&Y)olwG2w}fN6FH0~=Ivaj5tOJ5+HcsQ8IwRB>IXcuWwgxCKPqiQ$R^ zs)RE{g2BWcRoo9M{>2|vJaY2I9P#=TsBA(6s%$<~Tp$ruyb3CQCIeNx6)L_X7gc-$ zRGgt2Rs02r%f!LNz%hAa?)G{^3rB{W#H7@mcmpF7XE@W%1jckWHi9wDUCd!jV{=m& z)6~EP#xyrZFkLKRqL!u>Fs73Q+*}uPW1y%o*k)HpSEvd@BLjoU7Zp`E%jZinF>*|{ zEY#HV%7{>2RX&ePQ|{!Tj+RT<~J(%A&hR9WNfe2&?IZphaSEC3dQcNN{8I6s0qQ-thrU zUxOdV@7L`GHu8LFQaV4FjMr1u?Td?GFxdo^PfU>k3;>@KXWsx~c7}Z}bHiQ9&9r9d zO~pbcc3L{_D#$jF)4~q2%&v{=NxkiAfhxrQS$5NQDQU2~`m~TKL2ypho+@SS8Li6td@RTOLKL|GrC?Dc5A5y;C4)Z^= zkYT}yJn5eFxjfKtP0>)MTq^rEE6cv+3MJp>vO#T$=X@$u1mET)-nTrr<6D9MZ!3~d zB|)6o<=M&Gaxi#D%8_rY*WH~SUon0|W>h6H?Gj1!;?dyF5V!BQD0FFwGT@1fN6Gy1(vAd;!yDwHmKrCP;n1?RB>IXxI+r6xCK=F zN)VUn098C?a$>G{eLht7Mg*#C6;!+<5mmeu zD(;erDn0=!Zjgs6{sqKk;$VQq9!z#NkRim(KG{BRQN5unLr!8+YEHa?p`!(iX=LUI zV;UP8!I;jb1~8_%n<$3tIbq&tnEbKOT<8NdIema8Q3008y9;F}hZjk5 p!i2=ZTKkHWCKnWOOx{ss!MP(B5(WiOLpdjl7fUi~Og1c52LL?yiK+kq diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co index 6edbc54bbd86344185b6ea40eff13fe2820c6f1f..7ef5602d630abe2e9fbc0201f0dbdf58cf562af4 100755 GIT binary patch delta 950 zcmZXTy>HV%7>Dnyk`G{Lpi-p@Dd9tE_(B(qpOzW1qMbHLsF1dCn`orpMxz2nvPAe@F*f)q)><-~BfT zsjD4Lv9gxg%xcQ570Imy}di5!4NFLuufqge5F;dn+Lm_0prTzeWAoX4q$_uhha`qW;b^2ZrZ*GHv|c zJ|`WVj{_hIZj@XIPWvuJ!Otgwa-l{218-u`)Txg>4-{~}mp8QWe(!%{P9y61mtjAq z{!9t`Kl74_0h`QrzYdEM3pCtNG{oj4*+-5q1y~k*EaZ3}1v&5Ix+wa{O92X8*|&mR z^ih;KAM?U|-6i+R1^KbB<)Q${GR1~IrQ$Q^1{m#GOLMk ezxqO8U1Y#6d05|sC*-KU6;FAu0Q@DhYVsdhx3*{i delta 754 zcmbQRoAJR;M$URyA2%jOMhE7JobpTwkrTC+Ii4_L00S7!AOPVr0x32iJ`oDxpD0D8 z3m{wui^+nF;({2eCR;M9D`JR3&4BCO+{mb{zIlrVhb{{fL+k#@7eXhn8CvKW=ow62 z$S5_rA)2k8ok7DAY88~?0n_XZ1~#bT;!yDmcBtY?P;rlFRB>IXxQrjFxCK=FiUW{= zY=<*cLc|r!fD;h?>0#y7!DysN15SNLAiQ&ZL!ieqlh8B(tIf+TBIq?QYCeCoCn+c5RXlMjunmL=p zn8xO&Fs6wGoM~p{0uwcJwuCXwoef}2OEZMI&Q3s45wOip=B_Xm&Q1mlllLnqZ@w2P z#Kd@F^2ca>PIwxdtQli2Xi*07kVP1T2Kpn$oD;?qpS&@~oGBn|@~ap-Muy3nvG$w= M(2#sTxiDHD0CZk?6#xJL diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz_group.co index b6d3e01639f666670e1fbf1b25cbe5d445c42f55..ff75aceb0d6c3ce84b88ae1c956705e2e4c7a964 100755 GIT binary patch delta 989 zcmZXTPiWIn9LL`ab8XbaHkDC9y3N7}-tENq8x-qdKb>Nd)?WEXyW21X!qv9`Zs2o0So}g@iQ+0;c^Epo?CL? zsrjU7)eWtIY~0+gy5Z(ERMAR12CCL| z3+Yw^VbesGgY1T-S53`HI*oGDGQb%p$xUUb(|_xsjw$m9iboV!GR7Gi90X9z52Ci?X6 y@6#J4v*c^x3NeZ?^0^p=Z@sqX#TjPrF8NZt&d$^0vFSXP%-xT%9ZLWD6oP-UKg0I` delta 819 zcmbQShw;TOM$URyA2%jOMhE7JobpUBA}4Asb969f00S7!AOPVr0x32ib_j#;9m-JY z0tlDkz+^#2aX}1KlPww56){AiX2A7sZe-L~W=!0?QKgt=vyLW*E(;Sw>;B0eVH4O4 zE%XfZ3??sRl$tytoTJ_%4Qd~h;sMj_3=1q##l@lGD{N53m7wAt_Nd~zQ1JuNsNxn- z@hN^_Mi5-q87kr92p0k~AnMo|dR$S(BcbYbd{D(xCM!mW*XKiJD-afdMXI3U91!hr zt*ub;C2?>eFk=E#ydV)({1b@F#K8cIJ(%olAVY|mee(Z^MfJ|E3^|EOsX6fmhK?36 zrjeN=jA?9W1Y<2?NTs@^5=&vIvO$y;DQm`dg0X^sf)U3#yIA$fdhdO{-*+dS?j8Jwvp+C2s644M z5UyX?55<_8zJKqixK#zt=SC1-)X)wA`>(5uy8gi*@uCDHvH~M3=Uk)!@7_K~=^kG1 zWEU>SgG=w0r&lfyr<0F~ziZb5U^tC~>M`?(0So{qv5Q{;dEXwvI$_nlK_-3QUaRqx zX6P;1L|E*%b+IQS%Rv3M*hePrSx6FfWNl+?n0?zJb;Xvdw`^TTO}Vk%LaMG9NHI(# zY8o;ugiS+HHQ7p8rj*jOSG}Z7I{QM2&-=F#Ukn+*u08jixdHbb@?BqEBpIt%vUjX2U@><8w$CQtD z{QL*yIkMZ0M9Z)JYI6QinINc>arf7tvXKLt*pM}pW=me05jdA-dBIIfm7<#poJX@| zkLEeAjW3qYyq=+O<}I;vns81omowDNvYcyXS)uIF0y|-Qj~QmGN>0@4WY36(y!j$y zgTFeXlQN^zCj`h_;}(hFD8V=gzd3E2_zttvAtqi69cqhP!F0b6`H7>K!t@PtaNZ5U EKbn2a7XSbN delta 839 zcmX@{m~qA}NBy z&@<38n7oitYVw4vfO-xqsC7_E7fiD=2-u>ETR_EcIG~Cs*4dJ3LE(SognqZqNltj$oSax-F0=uf$~Hh#ZUEb4?E=}!$AJoA tLgHYp{|b~QKPcdstWjve`5_w;GzCyYIVZOlN^&LSKt#>e<9tS zzavr*-k#W^BXGPm_aVP80{iDiFzwXP3IXdsRGoVG(jQHy1Pq>mp_wC7I14Y&9#L|J zx7&XX2Nz+ipTG2NA~^Nz_~6vZ-k^Wt%J|lq0O$=EApWFs6rce36`6boa8TWW5Am+L z(dF6e))QsonxWU^7BZ#IsxCET)HYCORqCLY*=VDRq@(t_X&SALHBniUYN}qd`ZB7@ z<@Fj;b;Ur6(L$1@A)}4VmZ7Mc+>W}8s5ml0`kWtd*Bp(qJR!%Fa zOiC_G6+sfD44p5IjwYo~8tEEmJHW&e{XHgEwanhl>esuO#whnBxRjIWAxFw7>zp z6y^E>Jw{RcBEDBYi*sg}_P#MJnVHiIubg{XAoufW5??hh;IC#FN1H+T(O&y!#^4yw aHgmMlaK=q?9F1Qz!^d`rF2RFk2>t^T9Nstp delta 879 zcmezIgz>~S>nEB3eJ}SeO`E_fJ;HT)`MO z`DA84o1ulCfu6zSg^W^@1+pzBpU4rI9FeV1-(dx{9ZFe%X?BJQwy5IHQ1J*yRB=D3 zxI#LrcqCL@A{13T1uFi)3CyU6%jQER1U%tFU`7>GTq6ioycH_0fM&r2sQ7~@RQ0oA z;>pP3)(i~Gpb`hNkR=!xHbTWc@=(S1K*a?nZ_E*woKuM^dkHGLX7a-vamE9af99O4 zcXVOMNlZ%3i8nAbaDp>kjbKb8M-v#+*wPHfbTP1iF)b|MqQ-7;QBw=JHgji3gj%=) zONd%~kj>74BqvNr9ISO+q0;0Dg&dP_6k2e0-bY+fYE^#dyE%Lp=pkpTd}SD+yP diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtne.co index 7f5c12bc05ae00e5112b5363b3aaf7446b86f405..400da07fb47337980f25d70f84a1f97f68115fa2 100755 GIT binary patch delta 1054 zcma)*O^DM#6vt zje&6W*j6aQtm|8scFgTMupc*qB`1egSg^ja?BwGOe^i_lbg19$V=VqZeYBt4hRZAe zo-uh27h}P|vlG)_Cr8uC%Q&i@SpcJH4Aj0c1qN7CzekS01!S-P4(^eGykWAb%hnr> zB_+Av7F4Y8dZ*9#1gyze@9;WSm7a#1d>?B=MUhqAax@?F?QXwq)dk!V8pAg3_C*m$1kqp{d(RRLrIjC*~VjHiE>IkQZ}YPH6B=#3P<# zvn{@nW;3l+UWk)7@B)b{Yv$&FZjy$C9RpzLs-Gju&tLQNGUc;XKfgtJtn23wD4*T4 zh3QpWw9qMe$QI542mo}`&!15L;T=C8QGVbI@QRz1-+1UDOWXzEH7y(ne-{AQp}f=c z^Ig~f)XzUtu9Gi=bKw_%{>p!I;bk`(~|}1rpy7B%Df@J;|cHOEVmDWd#*o ziAo;LR6LqPRoB4P%C1Bu&rr;kPA9C(FuN z@^u&vc^8q)hurC)vsDnJ_vz;;@@aUH+&9AHgAs)9?Y39O3j9cRjRKsJxlveL)11ZS I;gK1F-%<9@-~a#s delta 867 zcmeD9zwBWJy0ZFY{Zb0n3S3mZ(w9$ z0%IDRo57ea78Wq3g%e!V*aa?XYT*o1VD9D!V_G=D6*!t2z(k!9qOO)81)5-+-5g<# zFf?+5Il|D`&>X4{$aLDQSmMsaRO2)GV1el5`Z7sQcp{#>u*_WW12jp0fF^MPs5;Kc r?B$Z2Fi~-^s9m|zWQB5$$rPMIaWTwJw3}IoT{+sve7)JOn{^S^ zM4?-U&9v`d^tFIARY9H1ZSsu9xgapJ&Q(GPa6d3gcz=hwab z4(1!2mp{OKzv<;Q$A8}v=<`<4P%xY2z23)|AKdctA?E5mFSl=uA2jT>-Qo-EKYZvS zOYGU41`Wwucw-m9tmWnJFmE7EJA=Nfq4VD`@eM5D{)+SKOa?#%J=H!LX$Hu=Aj&XB zrSgs}7t)TTO0J~2tV5ZrtRssY=Sa5bQl{+E3|DduRKb-+%dkG*naj}*F4%ZZOL=Dr zilP2Wrii^1lcoTw#HmTSlOV}(1&!6y$mm6b?tG%%UhdQgPwNPe+4d6J?43jYeiUu= j1H@~qZLOam*)1gW*Mi%+eN-x5FNi+$qr?pT>|go=jx*ZW delta 928 zcmdn-k+I_gBWJyi_sKfB}qV5PV5l?dChxruR~GUJ-fj;g0wHj8LkxUw)YwCS`!u0jAj*CfK5iJ43}I98tynpyCTM zQN<%+>cddQQ=sAxoWP8FxNJUDVuL4K2+XL0if06)inl_=6VNPx!~#3Rff!WvvtjDf zkj1ST7?wdL9^@iRFfeR{igOg9itmAn2TcB0ByM;I;RCSvC8+EhG;tSDa$(|NV3=%K ze6HTug&`*~DK#hFz|g=6&U7_`F^wEeU`%66GZ@pw(E`S_aDj^&yTL_G9i8C{3?1Q2 z7q|jPa|4j5J;-Kfgs`gAM8XKEKO#?EW7$*BF7;gSnqQu1X$7ix* zq3C4&a!F2j`kx$FZZ1>+%?$<6WITa?vUIuZH?^toRiBdBpEFxPpnV}0GBeC_y7O^ diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtz.co index 302a658023c3c8e1eee2ea8339d655c9aed42d4a..ada071557ed0b1be6e35cfd5fc42803de1a62aa9 100755 GIT binary patch delta 1030 zcmZXT&u`LT7{}kYaRassGaA6eFue$x36w%R6LtI;^I#T@GhsOQ@)lZP7TOeMj2>_Y z5)USGc`qhjriX=_60Rm(G~woU(&LW)2_7x&%d5@Ww9ohXJm2Si+qZq+e}m)i(AQnP zU*I5IId|X-bNA4}^`p#&2<&4Tf#1oY6=togSaR~>OLsJ!6!ggg3@n^+k!5)8${9+> zV7%Rd5TncMU@~JC>vJ4A&a1npIa&RgrgVsL@h& zr0OQZnuhc?GE7} z07sOMWj7yC-fy`1SIPylw-pHGcihz*lOL7yf+87g{p#jK0ccoOG!#$dnB=AOlqN-K zDzoX#REil!lX*r{an_bz&HeK-gOp`5vpkVed**C`pPKU-=E&#rjOG)=#P%A;ZHVNg zbdl_rLq4{@$e=8qAJd&OE;(;2&5&2+%Orq9#K2zo*=}3MSGgBYh>7p`4(S>jbUYvV Qi9>V#ZKqp+lWqw90s*7Sod5s; delta 839 zcmX?clySxpM$URyA2%jOMhE7JobpUF;wEY>bF?vM00S7!AOPVr0x32ic8G%T9jZ|2 z0tlDkz+^#2aX}1KlPww56){AiX2A7sZe--v*gQq^hdv7vL+k#@3nEuA%1!o+>Sr^w z&@<38n7oitYVw5WfO-xqsC7_E7fiD=2-u>ETR_EcIG~Cedz$G_A#V2IKg}{tG5OF33naPH+;*vU9sM6Y?NM_<-aG0DJd!yda*@z)0F)1}C z-oVJh1jaNrH-j;q%q(C`b4R$Su?t+()Bvu{%+(R07Oud;3C?u1gfpGo41j7i!8SXa z!yVz`2sh2e*&Jq?i?h?_hjH#qOg83|8>2-h+b2qL!qevD#6)wU4bW7!AqwK#0M^Oc yiL#TA0~NxA#KBtsB`QsRkjODvBgumELkvW50n|{=$?Zv!T(EdCVFVe$$N&IV4T&iL diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtz_group.co index 6c2cb554b4bb83c4a7c81aec96ac42e510b760fa..3298c170aa3dd42545261ff3cf9997c922a61618 100755 GIT binary patch delta 1032 zcmZXTyKmD_6vpq3(l#y(Edqrgp`=uZ6a;achaDso@+62i0}=`0_S&(NfD>Cz$^b)P zh#1@IEZZbhQ*<5*>+2MhC*$Nh{8ct<)`i#p~J$~#~-6}ztEJFX{z7$EpQ%n0O-Qmm* zpTW^G6o;9U-@LxLrw2wq=li4K!PAq-;aSihg+cL`C+z_q0DcE&UIXGQZ^5HvyS(1x z*mKi66ppA`yDl3@$K6I7w`A1OP`81*$k1CIB;hvdtm(RDbf*dFA+9&u^=V&5HCbG% zqh?#tkfIp~t18kuNH;X4smh&5$B-hb`lxkwN*>5k*DOLxoE3N_DaN^=ps;dMN$?3- z#FBso%yaP?7NcCO#wO(z@(La!s}%u$Cod|I-c6aF78SwNIk0iw&2y9wF1Y!1%2zXP zewXqvcJmtLq1&b*>-U(RM%ixGowdHn92i`3^FGb9H{5)K^02krX}+YqamPWXd1G-7 z>{i{4ZL0xFlAC{^JWOEKPX?6*<_{$wNwn&RV{*CrqgRXp6^pWp*eIK|Z7vnJZM5K6 zE|aurEG5`BD`afT=NuXF`ces$a66;zeVX4xHk;JKOi65=Kufz delta 879 zcmexyjPb-#M$URyA2%jOMhE7Jobrqw6E&AR?lETo0~pO90O2zNDK;QJ5C!2Ms6wR+ zAY2BA$%2gHf*7hMTQaIEVu(V`fa~4d$hcpbF=n%)>S>nEB3eK6S(q4F_fJ-cTEQqc z`D9c-o1ulCfu6zSg^W^@1!62FpNJKh91){X-(dx{9ZFe%X?BJQwy5IHQ1J*yRB=D3 zxI!YTcqCL@A_!GH1uFi)3CyU6%jQERCb+|ezzm2-*%>taQN>%K>J`u|m;e<|2t!pr z8!G-F7Fpbyfnga`!XX)1f`MTpRNNyKReTRrTwwCXSaHcYS*Ws?pt5TwKa3S;JTUoZ z?74bZ7lxd~q|}^v149ERIMdY##x!y?fiaCO&0tI?3kw+2+!-!v>;@M#HG*q1H*iF# zg)6Xd0W Ym?(f6$~oCQS(57qRMeXhWCSAv0KK!F1ONa4 diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co index fd751215503319144aead61bec10ef96965506c5..437e5f0fe1d883b85983021b79289aa783434379 100755 GIT binary patch delta 990 zcma)*&ui0Q7{}iP-BPEpp)mAd=RmCD)-+3#6vo=wtqlCNgP?-&?oIk8>6RuT$z*tm z6bc?Q-m3@oAm~M*;I;b`oL)S5@Ho8n=Fu#96Ua#ulIQ#VJikcbjlMBQ-Bj9 z6qq*ZW9U0DFi`+Kx9>v@+0fs2UB?UJ3T+P=J==)$7<&d!q#3)bb+ zVHcClZ6P1gY4hax+V#<6e3@mrCYk70No35P%a4;vJ{r>8!65yr12v7W$i$FKnfpX) Zrp#6}vTLp}Kgb8OS=z#B;Uu0|{tH+Fwc`K) delta 817 zcmX?bnQ_7;M$URyA2%jOMhE7JobpT)QYUIHtN&of00uCcK>)&M1X653{2>v-|Imm^ z7eKfS0(>wAgk}O##OQls6^O~%}tDY>YI0H=vcBaF|_WV{3CG!o1ulCfu6zS zg^W^@9g;cfbu6I{fl@qRnw`PK231@fDt-m(ahR+URQyLgOawyfLd6{dP{l2v;t>vD zMm=2C87h(B3Ks%1{Gj3izNq4nP;n153sRus4?zW{NWI2b-OPM(#rt=`bWi6JL3DK#hFz{tb}&U7<{F`bNEVN7#V z0~piT)C|Tnv9y3O%`8n|OmkBs7}LTWuE5a@ZkV$X$OK`q%`OIT6>d(BlP@Z&Ze~xD zWMcYYH+f;Y++>Fg0WNq7O`IH+VJ4W+1o2fu5`KR76o&6iNNROyB%~mgw17kc6g!S%I}&P}wCMz>14uBStL>cafD=1*3KdHX zQub80G87gTBvgfkDH$1yU|@q8AyFh&B#eEw6xG)G{qOtVy_0?C@n<;u0@-2f);0s- zM&^Jm!r4LjO(R$b(UnB76z4Du*>D6}oCh!FqZ_9nAbD8GpQ<8tcxB}jr9INQljo4T z1pAYt%TF^4xnObna&a`Bl*-rZ=dxflErIQ0W}N{H0H2CyUjuSq8xIBkMtComINRx& z*h8*5uzG42Bi}&-OC2E3?fOVpd&u8&UB?@Q75ZbyfKaq za6VosHUIoq%tCmo)b@vA3)rfLZNzU0N+K(bW+DY?E0NVoEs@PyI!%`OgjQuKkzzA# zCN;%`%CeA1r5+dL!<8FK+HF&m$hVbi;ov42<0pgx`E7D+`eTr?CFf7*5kJx5cUmHS hOJk}VM$URyA2%jOMhE7JobrqU6E&ARX4o@;0gPr4fbbcC6dMp{BtiHYO{jDM zgv&5vvLK_lAcm^RmW=9(7@|-!;CeSVGU_WcW^dl8QqHp3KvTz(g^8hc|Kx(C32cTI zdIow1lNT~dO*Y72V`gVan0zoO#c}5?~?_ z+5#%RBLG$087h9k0nDg}%lbhjZn(mQz>G+!_z7QB@f4`|0W=Hpq2dvtsOqbr;s&wE z;?@idtxyR8h>3972~hDF>2M)1;|qvn;$X;_ESS2p-hv?~F)1}C-oVh&62>$#bAmCA z4UJ(;CsSt_)7;Dz#xyoJfiX>-%wSA2CnFfs+}r@hw6KILaI}CJCJ40I*#vHci;>yn z`-<8?tiO3n+8aj3jL8c#^f}>aZSu_wbHN+XRC6N<;xUcMikaq|FrN72#7uLh4@r|} YW!f<`GOdG>w%3A+aDa9p}W3o%k1a%D|E^ zM9RRB>IT>l3=AxQGLS8djOmULBe3xwVC*|fR&Ckud*6G0wobnF4`%X{;l}OzD$6j9 zOJ`h>o!Do0-#0sL5Fa#xt4Yp8VI#UaW|DVi+2|)JSmHcW$Y1Ef2h7g)1xse6&n{jt zm7Uz?tQ7fx)s}g=Iljgpm#@cQZgi7k&T)riABZW^_o$YtGb4|3Q~|PN>AUzX za|^G{-1ygkEDZ@;aohybLDmEtBZl<$ zfcTu6su&&<|2k<%i#_7!$?Vj7#D9B~su-RpUIY!8EliUF_-bVRocIRL4+=~N?+(6< zPt+!`k*3>_7nBo?3~a%B*SO4{n{kF~on| OIyW08%kUQqPVpalleZ%P delta 752 zcmZp8z(k%TI)3l(n&L>0Gyibptr8TD{k zXQ%{&8(av?@Pmp^fH2^akx=mtGz(Io;s?UuO2LeLsJKNuvbZ$^LlsnFM;fvO14AoR z{6Q9~_ynl9#^i%p;*189Z)R<;H?(kK$Vp5}&51WKGI4=3-ArLjCl^;3)7;Sj#xyoH zgE38<;i6`)CNNQRM1!VRv#37D9mwmG|kn8IM2UED07!iI(>j*~Acs&01Akz@io zF)>e{6P~~(&&)FyTmVff3!up_Ve-Q~b50mfe6nJ`In$1e$x-=sOc_~|XXV>*-hc+f J$H@^dJkRfa=N%Hr<_yk%Kzi1_*P|e8 zZXMBi3K&N>-|c%{5L}UhOqiiB+y3p1!tA|FsvSnq#95fjp6kMOc%yKRk{R*d)eFd6 zqhNOR@%rIbIyJwvetmhdUKLAv_!i(dqveceZ~JSv$mo_i zR2*bW(~&6|3UVzp9Z6H<*oKSxl8M{{+qRsk-_f5)L(Lrebp;I+`Cy1NQ?-z4IY`oV zWVy(8ELGDLmvNmwqw9ypo=?U~e;R~cwZe*mT9Ye$UQ}77rdEZjB1?Ty5+#AJ3?!N3 z%LBHil<+a!!Je^wAq7tHLoGLZHX)yiNG{M6cwbNQtAu~LljM7ZKW-$sNcbN)$=ig3 zM}fi2n4aAy>}xYH@#87s^InVsGax+kAjvJl)8Xv6_?+l?EO@9)F&7fiB64j@T|IMwZ)eMV7C1B3tIFk!|(jZ994bd)~e aU#0-RX-?$SUH8~@@HBmWN&alf1Z delta 763 zcmZp8$avvCBWJyj9KFKODv%lLn(1E&CamK231@MD(+*CDy|C^|B!?# zZUGhl5r``83>ClN0A|#~W&NNM6>e}LFe4Hw&f$+Lo&ptT2tpOlhl*Q-ql#BS#W%zw zi(4}=v_d5e(vc+?7$!i)BeGG&zks+*91JTa2WIcAw_wOgOiIm(H!yUxgfWfGoM232 zLt_}z$;}zYGTb?;w47-^>&^S*sjdwIlQ9HNDjK3H@M+25bkQgj3%aUkk|HTZUT7)mBA;(@ zqJ9G(z{_}4r#)yo2?vjpTq1n!ULY`PFkY3g zt(8E;zx#y$4hv!~6W+WXqd>F>Uk_);NsI8#dW-_`Jmh!4=Z$1y7$!hwGs#DU11-sa zomh$nw2O}#AH7Po0!&5KO~frqtC8e|S}beDNS3NxB*hXR$$UYIq>zt?6;{QFiX|bE z#X?*l)lP=Z1e+^DJfg~R08cX4@RzIX_{GK%q-YMms$axjbM{Ppr|S5yDPGtmS8SJD rxxHz;-MoTR@+{txY4|qiE6Z7mZD3U{!|(V&US`(F0sbLl83_IY>ngX~ delta 838 zcmdn7m9b+BBWJyObf+fB}qV5Pjpl2|7 zA*0k}hY*f>9ZRS~pcD_7W@j+5K@}H=ieIrq6<30a|A<5t*M*8Z_@atiK*b{*z>Iph ztTR;N0fYe;_k)TDc)^9h42XT~3?66}q(Idt1fZ(Vhl=kALlzHVV5ovhNW>sZFfg=2 z#TDXE#V0_;UnHT5KLc@@I2b+@O`a9Ht=`bWi6JL3DK#hFz{tb}&U7<{F)f{3VN5ei z0~piT)C|TnF}8p)OeFs4+P(%ARvZC`8M9sM(y13;-WEgIWLp diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz_group.co index 00d8f51b7a3ab4bb12a8ffa266ecd4200858c795..9bdf92bdb8f1ece96519a094a49a9e629a1cc766 100755 GIT binary patch delta 935 zcmZXTF>KR76o&8IlGI{pB&0|zX#t5OqK)Guwj-go2}uS7s8U0lR29c1vC~i|cI^}@ zmY5-Xs#_Qf3kwp80!U2B$XGTe#LNJNl?7v;Em=M5{Qm!Y@7_tLyU_cysvzr9J$$lNT^` z9gZjC8_(t!!q(;K9y^>);)z?yD+1rd0K5cf}@8eq9N+UFnF5}@@tg8za8W&lvme-T%!DVH^}c$ zKD+M-@~g@Y9#Hmm!x8?kPWj`Kj~vmW{OoR!Ym|R>d;7`flz-p!kt2pK=Yg~nOpM$F zc)1tkfBsu$!M;`a;io|vsJJ1kh%1yOPo@h+Px3;^lbLkRllhXrOek_5&4_|0vpK(6 zu9EerP~<%+if%!~-Z{_t-3o$0zAY#1xjJEXenM!G-?bF$KMazni;;7B#n0)pStN&b hk@eg=GOmlvX_fq_uQ8k3B)PlB&bHkn_)89Jv3~(#z5DcTw6W9zb z^bGV2CNE@^nrsls#>~!;F!^AJc>NMfsJ&21989w_tg%5ASAvTB*rST;Ld6TBP{l2v z;yZj%#hszz7aYKhdbq3~R3gF!E(B&oLd8#bp^B$K#SfrakPj8V5rC?`3My_8jx27? zz|abnm=J?3!N4#9Dn26)Rs0i(%f!KuFv!fZ@2uD}5$@dhs zfLL$yrto)+j2V*`M(T6I)7s>lk>-LopsD5tG$m?GR*W*|gz>~DCq|hweF&aBE6R>( VLFnXLQFfdHVUQsBJy|hQ9sn Date: Thu, 20 Nov 2025 17:12:39 +0800 Subject: [PATCH 04/10] tune a8w8_blockscale&bpreshuffle for tencent (#1444) * tune a8w8_blockscale&bpreshuffle for tencent Signed-off-by: LJ-underdog * update a8w8_bpreshuffle_tuned_gemm.csv Signed-off-by: LJ-underdog * Update a8w8_blockscale_bpreshuffle_tuned_gemm.csv * update aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv Signed-off-by: LJ-underdog * Update a8w8_blockscale_tuned_gemm.csv * updated a8w8_blockscale_tuned_gemm_ds_v3.csv&a8w8_bpreshuffle_tuned_gemm_dsv3.csv Signed-off-by: LJ-underdog * update aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv Signed-off-by: LJ-underdog --------- Signed-off-by: LJ-underdog --- .../a8w8_blockscale_tuned_gemm_ds_v3.csv | 577 ++++++++++++++++++ .../a8w8_bpreshuffle_tuned_gemm_dsv3.csv | 487 +++++++++++++++ 2 files changed, 1064 insertions(+) diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv index 4d3d45ab3a..5b18451ffc 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_ds_v3.csv @@ -66,3 +66,580 @@ cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio 256,16384,7168,2048,0,0,330.8182,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1454.08,855.8,0.0 256,20480,7168,2048,0,0,630.5854,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,953.55,555.4,0.0 256,32768,7168,2048,0,0,649.7976,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1480.57,848.8,0.0 +80,1,2112,7168,8,0,29.6812,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,24576,1536,8,0,21.7723,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,32768,512,8,0,11.9511,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,7168,16384,8,0,80.4411,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,36864,7168,8,0,126.2421,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,7168,18432,8,0,90.1639,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,128,7168,13,0,31.0068,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,1,8192,1536,8,0,10.8235,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,2240,7168,8,0,29.5828,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,32768,1536,8,0,28.4212,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,3072,1536,8,0,8.805,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,4096,512,8,0,4.2994,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,7168,2048,8,0,13.3935,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,4608,7168,8,0,30.2356,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,7168,2304,8,0,13.4935,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,11264,1536,8,0,14.1259,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,4096,7168,8,0,30.2807,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,512,7168,8,0,28.6767,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1,7168,256,6,0,4.389,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,2112,7168,8,0,29.936,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,24576,1536,8,0,22.0947,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,32768,512,8,0,13.3839,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,7168,16384,8,0,81.0783,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,36864,7168,8,0,127.8185,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,7168,18432,8,0,87.2351,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,128,7168,8,0,22.9607,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,8192,1536,8,0,11.0715,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,2240,7168,8,0,29.5412,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,32768,1536,8,0,28.4788,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,3072,1536,8,0,8.5502,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,4096,512,8,0,4.319,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,7168,2048,8,0,13.5635,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,4608,7168,18,0,35.42,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,2,7168,2304,8,0,13.5891,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,11264,1536,8,0,14.8831,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,4096,7168,8,0,30.4452,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,512,7168,8,0,28.3127,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2,7168,256,6,0,4.3638,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,2112,7168,8,0,30.8,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,24576,1536,8,0,22.3543,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,32768,512,8,0,11.9811,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,7168,16384,8,0,81.1975,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,36864,7168,8,0,129.7421,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,7168,18432,8,0,93.2707,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,128,7168,8,0,23.1411,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,8192,1536,8,0,11.1999,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,2240,7168,8,0,30.7308,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,32768,1536,8,0,28.9084,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,3072,1536,8,0,8.4646,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,4096,512,8,0,4.3014,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,7168,2048,8,0,13.6655,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,4608,7168,8,0,31.122,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,7168,2304,7,0,12.4283,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0,0,0 +80,4,11264,1536,8,0,13.9487,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,4096,7168,8,0,31.1835,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,512,7168,8,0,29.6291,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,4,7168,256,6,0,4.3806,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,2112,7168,8,0,31.0184,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,24576,1536,8,0,22.9327,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,32768,512,8,0,12.1631,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,7168,16384,8,0,84.4723,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,36864,7168,8,0,131.4905,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,7168,18432,8,0,93.7435,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,128,7168,8,0,23.3771,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,8192,1536,8,0,11.3803,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,2240,7168,8,0,31.1992,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,32768,1536,6,0,41.504,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,3072,1536,8,0,9.7794,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,4096,512,8,0,4.2922,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,7168,2048,8,0,13.9275,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,4608,7168,8,0,31.6788,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,7168,2304,7,0,12.4415,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0,0,0 +80,8,11264,1536,8,0,14.3103,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,4096,7168,8,0,31.6459,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,512,7168,8,0,30.1107,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,8,7168,256,8,0,4.2986,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,64,7168,8,0,22.9407,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,128,7168,8,0,22.9747,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,2112,7168,8,0,23.8827,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,7168,16384,7,0,81.0135,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0,0,0 +80,16,8192,1536,8,0,10.1315,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,32768,512,8,0,12.3079,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,24576,1536,8,0,20.9171,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,36864,7168,8,0,126.5277,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,7168,18432,7,0,83.8083,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0,0,0 +80,16,2240,7168,8,0,24.0495,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,32768,1536,8,0,27.3928,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,11264,1536,8,0,13.7603,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,16,4096,7168,8,0,29.3875,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,32,64,7168,8,0,22.2799,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,32,128,7168,8,0,22.8355,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,32,2112,7168,8,0,23.8695,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,32,7168,16384,12,0,90.6679,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32,8192,1536,12,0,12.3951,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32,32768,512,13,0,18.3123,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,32,24576,1536,13,0,32.0868,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,32,36864,7168,13,0,181.66,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,32,7168,18432,12,0,91.0487,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32,2240,7168,8,0,24.0079,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,32,32768,1536,13,0,42.9012,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,32,11264,1536,8,0,19.3971,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,32,4096,7168,7,0,32.7,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,0,0,0 +80,64,64,7168,8,0,22.0451,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,64,128,7168,8,0,22.2911,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,64,2112,7168,8,0,30.6896,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,64,7168,16384,18,0,142.6302,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,8192,1536,18,0,16.5967,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,32768,512,18,0,24.9151,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,24576,1536,18,0,37.2172,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,36864,7168,18,0,204.0713,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,7168,18432,18,0,130.9413,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,2240,7168,8,0,30.7432,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,64,32768,1536,18,0,50.8693,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,11264,1536,18,0,23.2527,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,64,4096,7168,18,0,33.6092,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,2112,7168,18,0,36.1692,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,24576,1536,2,0,71.0526,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,96,32768,512,10,0,39.2468,a8w8_blockscale_1x128x128_256x32x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,7168,16384,12,0,226.0095,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,36864,7168,12,0,375.3626,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,7168,18432,18,0,211.557,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,128,7168,8,0,19.2111,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,96,8192,1536,2,0,29.3651,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,96,2240,7168,18,0,36.3512,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,32768,1536,3,0,90.8503,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,96,3072,1536,12,0,12.2499,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,4096,512,18,0,9.8511,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,7168,2048,18,0,30.9572,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,4608,7168,18,0,54.5221,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,7168,2304,3,0,31.2932,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,96,11264,1536,18,0,38.5268,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,4096,7168,10,0,70.1189,a8w8_blockscale_1x128x128_256x32x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,96,512,7168,8,0,22.4563,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,96,7168,256,6,0,9.3194,a8w8_blockscale_1x128x128_256x16x64x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,128,64,7168,8,0,21.7027,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,128,128,7168,8,0,21.7803,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,128,2112,7168,18,0,32.8696,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,128,7168,16384,18,0,187.9301,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,128,8192,1536,18,0,29.2096,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,128,32768,512,16,0,41.9332,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,128,24576,1536,0,0,71.0226,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,128,36864,7168,0,0,355.7861,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,128,7168,18432,18,0,189.0192,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,128,2240,7168,18,0,32.9736,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,128,32768,1536,0,0,90.0163,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,128,11264,1536,18,0,36.5868,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,128,4096,7168,18,0,51.234,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,2112,7168,18,0,53.5813,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,24576,1536,3,0,100.982,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,160,32768,512,3,0,59.4173,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,160,7168,16384,18,0,293.8983,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,36864,7168,18,0,559.152,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,7168,18432,2,0,379.9103,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,160,128,7168,8,0,19.6555,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,160,8192,1536,3,0,38.2228,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,160,2240,7168,18,0,54.0881,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,32768,1536,2,0,130.1209,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,160,3072,1536,18,0,16.9339,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,4096,512,11,0,11.1599,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,160,7168,2048,18,0,47.0101,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,4608,7168,3,0,81.3331,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,160,7168,2304,10,0,45.9093,a8w8_blockscale_1x128x128_256x32x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,11264,1536,3,0,50.0469,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,160,4096,7168,12,0,71.1633,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,160,512,7168,8,0,28.3099,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,160,7168,256,11,0,10.8494,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,192,2112,7168,18,0,49.1485,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,24576,1536,18,0,102.1488,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,32768,512,16,0,58.0269,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,7168,16384,12,0,381.2924,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,36864,7168,18,0,513.2381,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,7168,18432,18,0,293.6094,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,128,7168,8,0,19.7799,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,192,8192,1536,16,0,38.6744,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,2240,7168,18,0,49.4897,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,32768,1536,2,0,132.3045,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,192,3072,1536,18,0,16.5627,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,4096,512,18,0,12.7227,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,7168,2048,18,0,44.5893,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,4608,7168,18,0,78.2074,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,7168,2304,3,0,42.1737,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,192,11264,1536,18,0,49.4729,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,4096,7168,18,0,78.657,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,192,512,7168,8,0,26.9995,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,192,7168,256,16,0,12.7423,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,224,2112,7168,18,0,54.1121,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,224,24576,1536,0,0,128.7353,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,32768,512,16,0,74.8462,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,224,7168,16384,18,0,349.3654,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,224,36864,7168,2,0,725.3164,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,7168,18432,18,0,381.1539,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,224,128,7168,8,0,20.1755,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,224,8192,1536,3,0,48.1293,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,2240,7168,18,0,54.4329,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,224,32768,1536,2,0,167.1315,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,3072,1536,3,0,23.8183,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,4096,512,10,0,14.9871,a8w8_blockscale_1x128x128_256x32x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,224,7168,2048,2,0,55.3617,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,4608,7168,2,0,103.4296,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,7168,2304,3,0,53.5745,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,11264,1536,3,0,62.5822,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,4096,7168,2,0,100.7899,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,224,512,7168,8,0,28.0503,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,224,7168,256,11,0,13.9371,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,256,64,7168,8,0,20.9311,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,256,128,7168,8,0,21.0991,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,256,2112,7168,18,0,49.4557,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,256,7168,16384,18,0,343.3197,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,256,8192,1536,0,0,45.4237,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,256,32768,512,16,0,74.2042,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,256,24576,1536,0,0,108.7836,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,256,36864,7168,0,0,606.9262,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,256,7168,18432,18,0,342.1461,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,256,2240,7168,18,0,49.7517,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,256,32768,1536,0,0,151.9194,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,256,11264,1536,18,0,62.9586,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,256,4096,7168,0,0,107.5719,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,2112,7168,13,0,81.8363,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,288,24576,1536,3,0,158.2255,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,32768,512,16,0,91.3867,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,288,7168,16384,2,0,521.2036,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,36864,7168,2,0,976.0007,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,7168,18432,18,0,450.2698,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,288,128,7168,8,0,20.9287,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,288,8192,1536,2,0,56.7309,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,2240,7168,13,0,82.4603,a8w8_blockscale_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,288,32768,1536,3,0,209.4934,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,3072,1536,3,0,25.3575,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,4096,512,16,0,14.6299,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,288,7168,2048,18,0,65.1418,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,288,4608,7168,18,0,134.2474,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,288,7168,2304,3,0,63.8426,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,11264,1536,3,0,76.3027,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,288,4096,7168,18,0,106.7871,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,288,512,7168,8,0,28.7647,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,288,7168,256,11,0,16.8451,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,320,2112,7168,18,0,79.0835,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,24576,1536,18,0,163.7415,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,32768,512,16,0,91.3327,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,7168,16384,18,0,440.8611,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,36864,7168,18,0,865.4396,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,7168,18432,2,0,552.8448,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,320,128,7168,8,0,21.6103,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,320,8192,1536,18,0,55.9177,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,2240,7168,18,0,77.7226,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,32768,1536,2,0,220.1254,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,320,3072,1536,18,0,23.6307,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,4096,512,3,0,13.9711,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,320,7168,2048,18,0,63.8546,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,4608,7168,18,0,126.6017,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,7168,2304,18,0,63.521,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,11264,1536,2,0,81.5799,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,320,4096,7168,18,0,99.8999,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,320,512,7168,8,0,29.4567,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,320,7168,256,16,0,16.5307,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,2112,7168,18,0,86.7303,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,24576,1536,3,0,185.6181,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,352,32768,512,16,0,107.482,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,7168,16384,18,0,528.4344,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,36864,7168,18,0,1107.0329,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,7168,18432,18,0,564.1177,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,128,7168,8,0,21.4607,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,352,8192,1536,3,0,66.795,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,352,2240,7168,18,0,85.4167,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,32768,1536,0,0,240.9319,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,352,3072,1536,2,0,29.2036,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,352,4096,512,16,0,19.1615,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,7168,2048,18,0,81.6351,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,4608,7168,18,0,156.3831,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,7168,2304,3,0,77.3487,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,352,11264,1536,1,0,90.9328,a8w8_blockscale_1x128x128_256x128x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,352,4096,7168,18,0,135.5164,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,512,7168,18,0,36.2172,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,352,7168,256,11,0,19.9235,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,384,2112,7168,18,0,77.6818,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,24576,1536,0,0,160.4975,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,32768,512,16,0,106.8048,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,7168,16384,18,0,510.7671,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,36864,7168,0,0,823.1322,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,7168,18432,18,0,520.9614,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,128,7168,8,0,21.6103,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,384,8192,1536,0,0,66.5862,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,2240,7168,18,0,79.7882,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,32768,1536,0,0,200.5321,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,3072,1536,2,0,30.3712,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,4096,512,16,0,18.5131,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,7168,2048,18,0,76.1686,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,4608,7168,0,0,138.3706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,7168,2304,3,0,79.7931,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,11264,1536,0,0,86.1339,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,4096,7168,2,0,157.4221,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,384,512,7168,18,0,32.4251,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,384,7168,256,16,0,19.8795,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,2112,7168,18,0,85.7363,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,24576,1536,2,0,214.7142,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,32768,512,16,0,124.5641,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,7168,16384,2,0,638.6827,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,36864,7168,18,0,1289.8574,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,7168,18432,18,0,625.0472,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,128,7168,8,0,21.8407,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,416,8192,1536,3,0,79.0346,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,2240,7168,2,0,100.4724,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,32768,1536,2,0,286.0878,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,3072,1536,3,0,36.8592,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,4096,512,16,0,21.6743,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,7168,2048,2,0,86.7631,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,4608,7168,2,0,195.7449,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,7168,2304,3,0,86.4847,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,11264,1536,16,0,110.7437,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,4096,7168,2,0,162.395,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,416,512,7168,12,0,36.4728,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,416,7168,256,11,0,22.6419,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,448,2112,7168,18,0,78.2418,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,24576,1536,2,0,215.2938,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,448,32768,512,16,0,122.9093,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,7168,16384,18,0,568.7454,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,36864,7168,2,0,1317.354,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,448,7168,18432,18,0,571.0165,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,128,7168,8,0,21.8071,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,448,8192,1536,2,0,77.745,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,448,2240,7168,18,0,94.1239,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,32768,1536,2,0,285.5158,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,448,3072,1536,18,0,36.0088,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,4096,512,16,0,21.0995,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,7168,2048,18,0,84.8563,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,4608,7168,18,0,167.8608,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,7168,2304,2,0,79.8603,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,448,11264,1536,2,0,103.8016,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,448,4096,7168,18,0,143.2833,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,512,7168,18,0,32.4547,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,448,7168,256,16,0,21.6627,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,480,2112,7168,2,0,101.052,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,24576,1536,3,0,236.2091,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,32768,512,3,0,140.615,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,7168,16384,2,0,685.2973,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,36864,7168,2,0,1375.2819,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,7168,18432,2,0,722.8485,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,128,7168,8,0,21.7087,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,480,8192,1536,3,0,83.5863,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,2240,7168,2,0,100.3492,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,32768,1536,0,0,310.0055,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,3072,1536,3,0,37.096,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,4096,512,16,0,22.2995,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,480,7168,2048,2,0,97.8496,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,4608,7168,2,0,197.4349,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,7168,2304,3,0,98.382,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,11264,1536,3,0,112.3881,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,4096,7168,2,0,193.8503,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,480,512,7168,18,0,36.4788,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,480,7168,256,11,0,25.0591,a8w8_blockscale_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,0,0,0 +80,512,64,7168,8,0,21.4855,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,512,128,7168,8,0,21.8127,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,512,2112,7168,18,0,95.2955,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,512,7168,16384,0,0,616.2237,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,512,8192,1536,0,0,82.2627,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,512,32768,512,16,0,138.6678,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,512,24576,1536,0,0,199.0229,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,512,36864,7168,0,0,1092.6052,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,512,7168,18432,0,0,587.1778,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,512,2240,7168,18,0,96.0611,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,512,32768,1536,0,0,258.5844,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,512,11264,1536,0,0,99.7424,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,512,4096,7168,0,0,145.8489,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,64,7168,8,0,21.7759,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1024,128,7168,8,0,29.8,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,1024,2112,7168,18,0,167.8663,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,1024,7168,16384,0,0,1140.981,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,8192,1536,0,0,141.623,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,32768,512,16,0,263.8865,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,1024,24576,1536,0,0,390.4668,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,36864,7168,0,0,2089.0389,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,7168,18432,0,0,1164.9336,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,2240,7168,18,0,171.2028,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,1024,32768,1536,0,0,502.3442,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,11264,1536,3,0,212.6951,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,1024,4096,7168,0,0,293.2092,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,64,7168,8,0,31.3536,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0,0,0 +80,2048,128,7168,18,0,32.7228,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,2048,2112,7168,18,0,322.7936,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,2048,7168,16384,0,0,2211.9911,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,8192,1536,0,0,252.5388,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,32768,512,16,0,513.9487,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,2048,24576,1536,0,0,740.166,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,36864,7168,0,0,4009.9839,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,7168,18432,0,0,2123.1521,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,2240,7168,18,0,330.7757,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,2048,32768,1536,0,0,982.4949,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,11264,1536,0,0,347.4495,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,2048,4096,7168,0,0,506.1934,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,64,7168,18,0,32.8716,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,4096,128,7168,18,0,54.2453,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,4096,2112,7168,18,0,621.3309,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,4096,7168,16384,0,0,4302.3785,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,8192,1536,0,0,492.553,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,32768,512,16,0,1016.0847,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,4096,24576,1536,0,0,1448.9153,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,36864,7168,0,0,7895.1562,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,7168,18432,0,0,4033.3849,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,2240,7168,18,0,643.893,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,4096,32768,1536,0,0,1930.1166,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,11264,1536,0,0,729.0026,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,4096,4096,7168,0,0,907.0889,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,2112,7168,18,0,913.7307,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,6144,24576,1536,3,0,2533.8727,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,32768,512,16,0,1515.4856,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,6144,7168,16384,0,0,6608.4842,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,36864,7168,0,0,12073.6156,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,7168,18432,0,0,6181.2122,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,128,7168,18,0,80.2986,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,6144,8192,1536,0,0,819.0652,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,2240,7168,18,0,963.2848,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,6144,32768,1536,0,0,2916.9733,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,3072,1536,16,0,330.8397,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,6144,4096,512,16,0,196.9929,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,6144,7168,2048,0,0,809.0744,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,4608,7168,0,0,1526.3692,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,7168,2304,0,0,851.0657,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,11264,1536,0,0,996.9846,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,4096,7168,0,0,1377.5484,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,512,7168,0,0,232.7081,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,6144,7168,256,16,0,229.0933,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,8192,64,7168,18,0,57.1329,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,8192,128,7168,18,0,101.6448,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,8192,2112,7168,18,0,1213.0298,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,8192,7168,16384,0,0,8622.1052,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,8192,8192,1536,0,0,974.0173,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,8192,32768,512,16,0,2016.3824,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,8192,24576,1536,0,0,2886.1359,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,8192,36864,7168,0,0,15680.1116,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,8192,7168,18432,0,0,8118.7627,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,8192,2240,7168,18,0,1280.2434,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,8192,32768,1536,3,0,4534.0043,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,8192,11264,1536,0,0,1327.773,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,8192,4096,7168,0,0,1793.3575,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,2112,7168,18,0,1515.2969,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,10240,24576,1536,0,0,3651.431,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,32768,512,16,0,2515.5457,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,10240,7168,16384,0,0,11023.5824,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,36864,7168,0,0,19995.2839,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,7168,18432,0,0,10175.6663,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,128,7168,18,0,105.886,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,10240,8192,1536,3,0,1430.6685,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,2240,7168,18,0,1636.782,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,10240,32768,1536,0,0,4837.2435,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,3072,1536,16,0,547.2561,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,10240,4096,512,16,0,322.5837,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,10240,7168,2048,0,0,1327.4494,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,4608,7168,0,0,2530.3997,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,7168,2304,0,0,1388.6326,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,11264,1536,0,0,1650.8577,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,4096,7168,0,0,2250.9998,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,512,7168,0,0,315.2897,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,10240,7168,256,16,0,368.4904,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,12288,2112,7168,18,0,1828.9839,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,12288,24576,1536,0,0,4305.7272,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,32768,512,16,0,3016.8385,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,12288,7168,16384,0,0,12947.739,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,36864,7168,0,0,23625.3537,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,7168,18432,0,0,12232.127,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,128,7168,18,0,130.3021,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,12288,8192,1536,0,0,1449.011,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,2240,7168,2,0,2046.9716,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,32768,1536,0,0,5785.9662,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,3072,1536,0,0,554.6374,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,4096,512,3,0,388.9976,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,7168,2048,0,0,1607.9242,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,4608,7168,0,0,2993.2517,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,7168,2304,0,0,1685.1739,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,11264,1536,3,0,2345.0071,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,4096,7168,0,0,2718.6056,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,512,7168,0,0,379.5964,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,12288,7168,256,16,0,442.8496,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,14336,2112,7168,18,0,2116.3028,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,14336,24576,1536,3,0,5951.0038,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,32768,512,3,0,3544.876,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,7168,16384,0,0,15187.1505,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,36864,7168,0,0,27944.1037,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,7168,18432,0,0,14287.2359,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,128,7168,18,0,154.4819,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,14336,8192,1536,3,0,1969.4415,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,2240,7168,18,0,2246.3365,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,14336,3072,1536,3,0,757.7926,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,4096,512,16,0,451.8624,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,14336,7168,2048,0,0,1856.2897,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,4608,7168,0,0,3576.6846,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,7168,2304,0,0,1951.3416,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,11264,1536,0,0,2380.0822,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,32768,1536,0,0,6695.5133,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,4096,7168,0,0,3229.5757,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,512,7168,0,0,448.1751,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,14336,7168,256,16,0,524.9808,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,16384,64,7168,18,0,109.3048,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,16384,128,7168,0,0,178.0376,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,2112,7168,18,0,2419.3022,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,16384,7168,16384,0,0,17329.3114,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,8192,1536,0,0,1921.1494,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,32768,512,16,0,4019.012,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,16384,24576,1536,3,0,6732.7083,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,36864,7168,0,0,31524.9476,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,7168,18432,0,0,16294.5139,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,2240,7168,18,0,2564.4367,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,16384,11264,1536,3,0,3104.6685,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,32768,1536,3,0,8991.3448,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,16384,4096,7168,0,0,3558.282,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,64,7168,18,0,190.0733,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32768,128,7168,18,0,326.7444,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32768,2112,7168,18,0,4824.1677,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32768,7168,16384,0,0,34852.366,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,8192,1536,0,0,3815.3061,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,32768,512,3,0,8065.9904,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,24576,1536,3,0,13388.3528,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,36864,7168,0,0,62743.4272,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,7168,18432,0,0,32659.3895,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,2240,7168,18,0,5242.0726,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32768,3072,1536,0,0,1461.7662,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,4096,512,16,0,1024.1373,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32768,4608,7168,2,0,10392.1318,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,7168,2304,0,0,4884.5134,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,11264,1536,0,0,5271.9454,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,7168,2048,0,0,4614.1608,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,32768,1536,0,0,15429.1559,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,576,7168,18,0,1350.8663,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,32768,1536,7168,0,0,2845.5978,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,4096,7168,0,0,7182.5926,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,512,7168,0,0,963.846,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,32768,7168,256,16,0,1261.8447,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,65536,64,7168,18,0,350.8382,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,65536,128,7168,0,0,608.956,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,2112,7168,18,0,9617.2811,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,65536,7168,16384,0,0,70478.0846,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,8192,1536,0,0,7621.6687,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,24576,1536,0,0,22949.6148,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,7168,18432,0,0,65026.138,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,2240,7168,2,0,10800.6316,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,3072,1536,3,0,3441.6001,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,4096,512,16,0,2019.1043,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,65536,4608,7168,0,0,15930.1957,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,7168,2304,3,0,10515.839,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,11264,1536,3,0,12267.214,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,7168,2048,0,0,8385.3458,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,1536,7168,0,0,5455.1913,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,36864,7168,0,0,127114.4032,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,576,7168,18,0,2678.1872,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,65536,4096,7168,0,0,14374.4254,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,512,7168,0,0,1946.2413,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,65536,7168,256,16,0,2331.7635,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,98304,2112,7168,2,0,14838.37,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,7168,16384,0,0,105519.4606,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,128,7168,0,0,869.1726,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,8192,1536,0,0,11517.8223,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,2240,7168,18,0,15696.2548,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,98304,3072,1536,0,0,4372.9543,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,4096,512,3,0,3142.5369,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,4608,7168,2,0,31224.8208,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,7168,2304,3,0,15786.6506,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,11264,1536,0,0,15928.1433,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,7168,2048,0,0,12615.4473,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,7168,18432,0,0,99003.4516,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,1536,7168,0,0,8187.0758,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,576,7168,2,0,4586.9954,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,4096,7168,0,0,21200.0175,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,512,7168,0,0,2849.7983,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,98304,7168,256,16,0,3479.2173,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,131072,2112,7168,18,0,19268.6081,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,131072,7168,16384,18,0,154106.4002,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,131072,128,7168,0,0,1131.8661,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,8192,1536,0,0,15358.2359,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,2240,7168,18,0,20374.7532,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,131072,3072,1536,3,0,6873.7457,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,4096,512,16,0,4029.3703,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,131072,7168,2048,0,0,17225.8363,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,7168,18432,0,0,131465.7679,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,1536,7168,0,0,10881.6908,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,576,7168,18,0,5376.424,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 +80,131072,4096,7168,0,0,28332.6632,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,512,7168,0,0,3803.5469,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,0,0,0 +80,131072,7168,256,16,0,4668.3753,a8w8_blockscale_1x128x128_256x64x64x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v1,0,0,0 diff --git a/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv b/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv index 5496aa90d8..0461b68501 100644 --- a/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv +++ b/aiter/configs/model_configs/a8w8_bpreshuffle_tuned_gemm_dsv3.csv @@ -316,3 +316,490 @@ cu_num,M,N,K,q_dtype_w,kernelId,splitK,us,kernelName,tflops,bw,errRatio 80,131072,2112,7168,torch.float8_e4m3fnuz,93,0,10093.2233,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3931.90,149.44,0 80,131072,2240,7168,torch.float8_e4m3fnuz,69,0,12728.0402,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,3306.93,121.21,0 80,131072,11264,1536,torch.float8_e4m3fnuz,71,0,12851.8716,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,3529.05,246.77,0 +80,1,128,7168,torch.float8_e4m3fnuz,25,0,10.735,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0,0,0 +80,1,576,7168,torch.float8_e4m3fnuz,10,0,11.2398,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,1536,7168,torch.float8_e4m3fnuz,10,0,11.721,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,3072,1536,torch.float8_e4m3fnuz,5,0,5.6562,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,4096,7168,torch.float8_e4m3fnuz,11,0,13.4343,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,7168,2048,torch.float8_e4m3fnuz,10,0,9.1526,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,7168,16384,torch.float8_e4m3fnuz,24,0,39.678,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,1,7168,18432,torch.float8_e4m3fnuz,10,0,42.556,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,8192,1536,torch.float8_e4m3fnuz,11,0,7.6538,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,24576,1536,torch.float8_e4m3fnuz,15,0,13.5093,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,32768,512,torch.float8_e4m3fnuz,9,0,9.7066,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,32768,1536,torch.float8_e4m3fnuz,15,0,17.5611,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,1,36864,7168,torch.float8_e4m3fnuz,6,0,76.2189,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,2,128,7168,torch.float8_e4m3fnuz,10,0,10.1434,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,512,7168,torch.float8_e4m3fnuz,10,0,11.2907,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,576,7168,torch.float8_e4m3fnuz,10,0,11.5102,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,1536,7168,torch.float8_e4m3fnuz,10,0,12.1679,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,2240,7168,torch.float8_e4m3fnuz,10,0,12.1906,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,3072,1536,torch.float8_e4m3fnuz,11,0,5.911,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,4096,512,torch.float8_e4m3fnuz,23,0,4.4526,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,2,4096,7168,torch.float8_e4m3fnuz,11,0,13.2939,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,4608,7168,torch.float8_e4m3fnuz,5,0,13.597,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,7168,256,torch.float8_e4m3fnuz,75,0,6.3266,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0,0,0 +80,2,7168,2048,torch.float8_e4m3fnuz,5,0,8.5166,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,7168,2304,torch.float8_e4m3fnuz,29,0,9.905,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0,0,0 +80,2,7168,16384,torch.float8_e4m3fnuz,10,0,37.3783,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,7168,18432,torch.float8_e4m3fnuz,10,0,42.2924,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,8192,1536,torch.float8_e4m3fnuz,5,0,7.8006,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,11264,1536,torch.float8_e4m3fnuz,15,0,9.4898,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,24576,1536,torch.float8_e4m3fnuz,108,0,13.6777,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,32768,512,torch.float8_e4m3fnuz,9,0,9.513,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,32768,1536,torch.float8_e4m3fnuz,108,0,17.4059,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,2,36864,7168,torch.float8_e4m3fnuz,20,0,78.3165,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,0,0,0 +80,4,128,7168,torch.float8_e4m3fnuz,10,0,10.0606,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,512,7168,torch.float8_e4m3fnuz,10,0,11.6471,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,576,7168,torch.float8_e4m3fnuz,10,0,11.6411,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,1536,7168,torch.float8_e4m3fnuz,10,0,12.2579,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,2240,7168,torch.float8_e4m3fnuz,10,0,12.3558,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,3072,1536,torch.float8_e4m3fnuz,11,0,5.8834,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,4096,512,torch.float8_e4m3fnuz,23,0,4.5342,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,4,4096,7168,torch.float8_e4m3fnuz,5,0,13.5363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,4608,7168,torch.float8_e4m3fnuz,5,0,13.6563,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,7168,256,torch.float8_e4m3fnuz,73,0,6.9558,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,4,7168,2048,torch.float8_e4m3fnuz,108,0,8.5722,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,7168,2304,torch.float8_e4m3fnuz,108,0,9.9551,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,7168,16384,torch.float8_e4m3fnuz,10,0,37.6428,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,7168,18432,torch.float8_e4m3fnuz,10,0,42.1052,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,8192,1536,torch.float8_e4m3fnuz,6,0,8.0606,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,4,11264,1536,torch.float8_e4m3fnuz,10,0,9.667,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,24576,1536,torch.float8_e4m3fnuz,15,0,13.9333,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,4,32768,512,torch.float8_e4m3fnuz,23,0,9.1702,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,4,32768,1536,torch.float8_e4m3fnuz,109,0,18.3707,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,4,36864,7168,torch.float8_e4m3fnuz,32,0,78.8657,a8w8_bpreshuffle_256x16x512x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v2,0,0,0 +80,8,128,7168,torch.float8_e4m3fnuz,10,0,10.4198,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,512,7168,torch.float8_e4m3fnuz,10,0,11.7575,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,576,7168,torch.float8_e4m3fnuz,10,0,11.871,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,1536,7168,torch.float8_e4m3fnuz,10,0,12.559,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,2240,7168,torch.float8_e4m3fnuz,10,0,12.6438,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,3072,1536,torch.float8_e4m3fnuz,10,0,5.8254,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,4096,512,torch.float8_e4m3fnuz,9,0,4.1858,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,4096,7168,torch.float8_e4m3fnuz,11,0,13.6103,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,4608,7168,torch.float8_e4m3fnuz,5,0,13.8099,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,7168,256,torch.float8_e4m3fnuz,75,0,6.2842,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0,0,0 +80,8,7168,2048,torch.float8_e4m3fnuz,24,0,8.685,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,8,7168,2304,torch.float8_e4m3fnuz,108,0,10.0038,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,7168,16384,torch.float8_e4m3fnuz,10,0,39.2255,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,7168,18432,torch.float8_e4m3fnuz,24,0,45.0508,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,8,8192,1536,torch.float8_e4m3fnuz,5,0,8.1558,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,11264,1536,torch.float8_e4m3fnuz,10,0,10.0366,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,24576,1536,torch.float8_e4m3fnuz,9,0,14.5085,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,32768,512,torch.float8_e4m3fnuz,9,0,9.9258,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,32768,1536,torch.float8_e4m3fnuz,5,0,18.2531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,8,36864,7168,torch.float8_e4m3fnuz,111,0,79.5469,a8w8_bpreshuffle_256x16x512x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,16,128,7168,torch.float8_e4m3fnuz,19,0,10.0946,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0,0,0 +80,16,4096,7168,torch.float8_e4m3fnuz,11,0,13.1683,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,16,7168,16384,torch.float8_e4m3fnuz,6,0,42.1056,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,16,7168,18432,torch.float8_e4m3fnuz,6,0,45.8424,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,16,8192,1536,torch.float8_e4m3fnuz,6,0,7.9894,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,16,24576,1536,torch.float8_e4m3fnuz,5,0,15.7929,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,16,32768,512,torch.float8_e4m3fnuz,9,0,10.2342,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,16,32768,1536,torch.float8_e4m3fnuz,5,0,19.2975,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,16,36864,7168,torch.float8_e4m3fnuz,6,0,80.7001,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,32,128,7168,torch.float8_e4m3fnuz,10,0,10.0298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,32,4096,7168,torch.float8_e4m3fnuz,12,0,18.5983,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,0,0,0 +80,32,7168,18432,torch.float8_e4m3fnuz,119,0,58.5048,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,32,8192,1536,torch.float8_e4m3fnuz,112,0,10.3018,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,32,32768,512,torch.float8_e4m3fnuz,76,0,12.0918,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,32,32768,1536,torch.float8_e4m3fnuz,119,0,24.3999,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,32,36864,7168,torch.float8_e4m3fnuz,133,0,93.517,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,64,128,7168,torch.float8_e4m3fnuz,24,0,9.9198,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,64,4096,7168,torch.float8_e4m3fnuz,114,0,23.8807,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,64,7168,18432,torch.float8_e4m3fnuz,121,0,89.7754,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,64,8192,1536,torch.float8_e4m3fnuz,114,0,12.7978,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,64,32768,512,torch.float8_e4m3fnuz,85,0,18.7478,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,64,32768,1536,torch.float8_e4m3fnuz,101,0,34.2699,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,64,36864,7168,torch.float8_e4m3fnuz,121,0,129.978,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,96,128,7168,torch.float8_e4m3fnuz,25,0,10.3566,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0,0,0 +80,96,576,7168,torch.float8_e4m3fnuz,10,0,12.065,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,96,1536,7168,torch.float8_e4m3fnuz,6,0,18.0262,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,96,4096,7168,torch.float8_e4m3fnuz,120,0,29.6988,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,96,7168,18432,torch.float8_e4m3fnuz,113,0,109.8187,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,96,8192,1536,torch.float8_e4m3fnuz,120,0,16.253,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,96,32768,512,torch.float8_e4m3fnuz,84,0,23.8023,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,96,32768,1536,torch.float8_e4m3fnuz,102,0,43.5312,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,96,36864,7168,torch.float8_e4m3fnuz,102,0,155.5081,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,128,128,7168,torch.float8_e4m3fnuz,10,0,9.909,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,128,4096,7168,torch.float8_e4m3fnuz,121,0,38.8588,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,128,7168,18432,torch.float8_e4m3fnuz,114,0,133.6804,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,128,8192,1536,torch.float8_e4m3fnuz,121,0,19.1559,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,128,32768,512,torch.float8_e4m3fnuz,85,0,28.3083,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,128,32768,1536,torch.float8_e4m3fnuz,85,0,54.7332,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,128,36864,7168,torch.float8_e4m3fnuz,93,0,201.1007,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,160,128,7168,torch.float8_e4m3fnuz,10,0,10.2346,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,160,512,7168,torch.float8_e4m3fnuz,25,0,12.1467,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0,0,0 +80,160,576,7168,torch.float8_e4m3fnuz,10,0,16.0783,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,160,1536,7168,torch.float8_e4m3fnuz,119,0,24.4126,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,160,2240,7168,torch.float8_e4m3fnuz,115,0,28.9759,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0,0,0 +80,160,4096,512,torch.float8_e4m3fnuz,84,0,7.7118,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,160,4096,7168,torch.float8_e4m3fnuz,119,0,40.6369,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,160,4608,7168,torch.float8_e4m3fnuz,122,0,46.244,a8w8_bpreshuffle_256x80x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,160,7168,256,torch.float8_e4m3fnuz,75,0,10.251,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0,0,0 +80,160,7168,2304,torch.float8_e4m3fnuz,119,0,27.2747,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,160,7168,18432,torch.float8_e4m3fnuz,136,0,168.4653,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,160,8192,1536,torch.float8_e4m3fnuz,100,0,21.719,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,160,11264,1536,torch.float8_e4m3fnuz,100,0,30.1239,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,160,32768,512,torch.float8_e4m3fnuz,84,0,34.3591,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,160,32768,1536,torch.float8_e4m3fnuz,100,0,73.0357,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,160,36864,7168,torch.float8_e4m3fnuz,156,0,267.3258,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,192,128,7168,torch.float8_e4m3fnuz,10,0,10.2034,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,192,512,7168,torch.float8_e4m3fnuz,10,0,16.0911,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,192,576,7168,torch.float8_e4m3fnuz,10,0,17.0079,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,192,1536,7168,torch.float8_e4m3fnuz,114,0,24.0194,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,192,4096,512,torch.float8_e4m3fnuz,84,0,9.9422,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,192,4096,7168,torch.float8_e4m3fnuz,123,0,49.1457,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,192,4608,7168,torch.float8_e4m3fnuz,128,0,47.9264,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,192,7168,256,torch.float8_e4m3fnuz,73,0,10.5659,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,192,7168,2304,torch.float8_e4m3fnuz,120,0,29.5015,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,192,7168,18432,torch.float8_e4m3fnuz,120,0,181.5802,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,192,8192,1536,torch.float8_e4m3fnuz,86,0,24.7435,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,192,32768,512,torch.float8_e4m3fnuz,85,0,37.0759,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,192,32768,1536,torch.float8_e4m3fnuz,85,0,73.3389,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,192,36864,7168,torch.float8_e4m3fnuz,94,0,286.0543,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,224,128,7168,torch.float8_e4m3fnuz,24,0,10.2858,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,224,512,7168,torch.float8_e4m3fnuz,19,0,17.1051,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0,0,0 +80,224,576,7168,torch.float8_e4m3fnuz,25,0,17.1379,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,0,0,0 +80,224,1536,7168,torch.float8_e4m3fnuz,115,0,31.7078,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0,0,0 +80,224,2240,7168,torch.float8_e4m3fnuz,117,0,37.5099,a8w8_bpreshuffle_256x112x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0,0,0 +80,224,4096,512,torch.float8_e4m3fnuz,76,0,10.187,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,224,4096,7168,torch.float8_e4m3fnuz,120,0,52.8765,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,224,4608,7168,torch.float8_e4m3fnuz,124,0,56.3824,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,224,7168,256,torch.float8_e4m3fnuz,73,0,10.9003,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,224,7168,2304,torch.float8_e4m3fnuz,85,0,32.8835,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,224,7168,18432,torch.float8_e4m3fnuz,85,0,207.6011,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,224,8192,1536,torch.float8_e4m3fnuz,100,0,28.8459,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,224,11264,1536,torch.float8_e4m3fnuz,100,0,37.6656,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,224,32768,512,torch.float8_e4m3fnuz,84,0,45.0439,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,224,32768,1536,torch.float8_e4m3fnuz,85,0,91.0766,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,224,36864,7168,torch.float8_e4m3fnuz,40,0,377.2671,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,256,128,7168,torch.float8_e4m3fnuz,10,0,10.8778,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,256,4096,7168,torch.float8_e4m3fnuz,85,0,60.0334,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,256,7168,18432,torch.float8_e4m3fnuz,85,0,209.5107,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,256,8192,1536,torch.float8_e4m3fnuz,85,0,30.3059,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,256,32768,512,torch.float8_e4m3fnuz,85,0,46.3119,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,256,32768,1536,torch.float8_e4m3fnuz,85,0,92.2438,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,256,36864,7168,torch.float8_e4m3fnuz,68,0,371.2046,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,128,7168,torch.float8_e4m3fnuz,10,0,10.7298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,288,512,7168,torch.float8_e4m3fnuz,6,0,17.6475,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,288,576,7168,torch.float8_e4m3fnuz,113,0,21.2123,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,288,1536,7168,torch.float8_e4m3fnuz,120,0,29.6846,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,288,2240,7168,torch.float8_e4m3fnuz,113,0,44.5,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,288,4096,512,torch.float8_e4m3fnuz,85,0,10.8502,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,4096,7168,torch.float8_e4m3fnuz,85,0,62.9998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,4608,7168,torch.float8_e4m3fnuz,130,0,66.7012,a8w8_bpreshuffle_256x96x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,7168,256,torch.float8_e4m3fnuz,72,0,12.7115,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,288,7168,2304,torch.float8_e4m3fnuz,85,0,40.9768,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,7168,18432,torch.float8_e4m3fnuz,85,0,272.1414,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,8192,1536,torch.float8_e4m3fnuz,85,0,32.0963,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,11264,1536,torch.float8_e4m3fnuz,102,0,42.7952,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,32768,512,torch.float8_e4m3fnuz,84,0,53.506,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,288,32768,1536,torch.float8_e4m3fnuz,102,0,101.9306,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,288,36864,7168,torch.float8_e4m3fnuz,102,0,437.3066,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,128,7168,torch.float8_e4m3fnuz,24,0,10.9094,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,320,512,7168,torch.float8_e4m3fnuz,6,0,17.8315,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,320,576,7168,torch.float8_e4m3fnuz,112,0,22.5067,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,320,1536,7168,torch.float8_e4m3fnuz,126,0,32.1157,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,320,2240,7168,torch.float8_e4m3fnuz,78,0,53.8992,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,4096,512,torch.float8_e4m3fnuz,84,0,10.783,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,320,4096,7168,torch.float8_e4m3fnuz,121,0,63.2822,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,320,4608,7168,torch.float8_e4m3fnuz,136,0,72.1189,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,320,7168,256,torch.float8_e4m3fnuz,74,0,13.0231,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,320,7168,2304,torch.float8_e4m3fnuz,85,0,40.2392,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,7168,18432,torch.float8_e4m3fnuz,101,0,270.4958,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,8192,1536,torch.float8_e4m3fnuz,101,0,32.6299,a8w8_bpreshuffle_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,11264,1536,torch.float8_e4m3fnuz,85,0,45.0808,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,32768,512,torch.float8_e4m3fnuz,85,0,55.0543,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,32768,1536,torch.float8_e4m3fnuz,85,0,114.3911,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,320,36864,7168,torch.float8_e4m3fnuz,128,0,476.2091,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,352,128,7168,torch.float8_e4m3fnuz,10,0,11.2254,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,352,512,7168,torch.float8_e4m3fnuz,112,0,22.2763,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,352,576,7168,torch.float8_e4m3fnuz,112,0,22.5607,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,352,1536,7168,torch.float8_e4m3fnuz,133,0,39.9737,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,352,2240,7168,torch.float8_e4m3fnuz,78,0,54.0168,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,352,4096,512,torch.float8_e4m3fnuz,84,0,12.0603,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,352,4096,7168,torch.float8_e4m3fnuz,136,0,74.9351,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,352,4608,7168,torch.float8_e4m3fnuz,128,0,84.2541,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,352,7168,256,torch.float8_e4m3fnuz,75,0,15.1695,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1,0,0,0 +80,352,7168,2304,torch.float8_e4m3fnuz,100,0,49.1372,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,352,7168,18432,torch.float8_e4m3fnuz,86,0,324.6944,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,352,8192,1536,torch.float8_e4m3fnuz,85,0,38.7583,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,352,11264,1536,torch.float8_e4m3fnuz,85,0,51.422,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,352,32768,512,torch.float8_e4m3fnuz,72,0,64.3624,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,352,32768,1536,torch.float8_e4m3fnuz,71,0,132.552,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,352,36864,7168,torch.float8_e4m3fnuz,71,0,568.6663,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,128,7168,torch.float8_e4m3fnuz,10,0,11.3822,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,384,512,7168,torch.float8_e4m3fnuz,113,0,21.2655,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,384,576,7168,torch.float8_e4m3fnuz,113,0,21.2619,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,384,1536,7168,torch.float8_e4m3fnuz,114,0,37.6905,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,2240,7168,torch.float8_e4m3fnuz,78,0,53.062,a8w8_bpreshuffle_256x96x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,4096,512,torch.float8_e4m3fnuz,84,0,12.2599,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,384,4096,7168,torch.float8_e4m3fnuz,86,0,81.866,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,4608,7168,torch.float8_e4m3fnuz,93,0,81.9457,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,7168,256,torch.float8_e4m3fnuz,72,0,15.1515,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,384,7168,2304,torch.float8_e4m3fnuz,86,0,48.0036,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,7168,18432,torch.float8_e4m3fnuz,86,0,300.0663,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,8192,1536,torch.float8_e4m3fnuz,85,0,38.4623,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,11264,1536,torch.float8_e4m3fnuz,85,0,51.3588,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,32768,512,torch.float8_e4m3fnuz,85,0,63.438,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,32768,1536,torch.float8_e4m3fnuz,71,0,132.9488,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,384,36864,7168,torch.float8_e4m3fnuz,94,0,555.2415,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,128,7168,torch.float8_e4m3fnuz,24,0,11.6798,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,0,0,0 +80,416,512,7168,torch.float8_e4m3fnuz,112,0,22.5591,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,416,576,7168,torch.float8_e4m3fnuz,112,0,22.8351,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,416,1536,7168,torch.float8_e4m3fnuz,119,0,38.3401,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,416,2240,7168,torch.float8_e4m3fnuz,113,0,62.1588,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,416,3072,1536,torch.float8_e4m3fnuz,100,0,20.7247,a8w8_bpreshuffle_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,416,4096,512,torch.float8_e4m3fnuz,85,0,13.5035,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,4096,7168,torch.float8_e4m3fnuz,85,0,86.134,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,4608,7168,torch.float8_e4m3fnuz,138,0,101.4186,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,416,7168,256,torch.float8_e4m3fnuz,74,0,15.9059,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,416,7168,2048,torch.float8_e4m3fnuz,85,0,48.1552,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,7168,2304,torch.float8_e4m3fnuz,85,0,51.2564,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,7168,16384,torch.float8_e4m3fnuz,85,0,310.2738,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,7168,18432,torch.float8_e4m3fnuz,85,0,352.9218,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,8192,1536,torch.float8_e4m3fnuz,72,0,43.8675,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,416,11264,1536,torch.float8_e4m3fnuz,85,0,58.2264,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,24576,1536,torch.float8_e4m3fnuz,102,0,122.5387,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,32768,512,torch.float8_e4m3fnuz,84,0,73.9008,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,416,32768,1536,torch.float8_e4m3fnuz,85,0,155.3704,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,416,36864,7168,torch.float8_e4m3fnuz,93,0,649.7863,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,128,7168,torch.float8_e4m3fnuz,10,0,11.523,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,448,512,7168,torch.float8_e4m3fnuz,112,0,22.7188,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,448,576,7168,torch.float8_e4m3fnuz,112,0,22.8423,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,448,1536,7168,torch.float8_e4m3fnuz,113,0,46.246,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,448,2240,7168,torch.float8_e4m3fnuz,114,0,70.1253,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,3072,1536,torch.float8_e4m3fnuz,92,0,22.3115,a8w8_bpreshuffle_256x32x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,448,4096,512,torch.float8_e4m3fnuz,85,0,13.3723,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,4096,7168,torch.float8_e4m3fnuz,85,0,84.5872,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,4608,7168,torch.float8_e4m3fnuz,138,0,97.0854,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,448,7168,256,torch.float8_e4m3fnuz,72,0,15.6539,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,448,7168,2048,torch.float8_e4m3fnuz,85,0,47.8488,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,7168,2304,torch.float8_e4m3fnuz,85,0,49.72,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,7168,16384,torch.float8_e4m3fnuz,85,0,316.355,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,7168,18432,torch.float8_e4m3fnuz,85,0,356.7658,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,8192,1536,torch.float8_e4m3fnuz,72,0,43.8247,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,448,11264,1536,torch.float8_e4m3fnuz,85,0,57.7888,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,24576,1536,torch.float8_e4m3fnuz,93,0,117.3539,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,32768,512,torch.float8_e4m3fnuz,85,0,72.2692,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,32768,1536,torch.float8_e4m3fnuz,85,0,153.3388,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,448,36864,7168,torch.float8_e4m3fnuz,93,0,631.7091,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,128,7168,torch.float8_e4m3fnuz,10,0,11.607,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0,0,0 +80,480,512,7168,torch.float8_e4m3fnuz,113,0,21.5703,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,480,576,7168,torch.float8_e4m3fnuz,112,0,24.5819,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,480,1536,7168,torch.float8_e4m3fnuz,113,0,45.4048,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,480,2240,7168,torch.float8_e4m3fnuz,62,0,70.4585,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,3072,1536,torch.float8_e4m3fnuz,94,0,22.3915,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,4096,512,torch.float8_e4m3fnuz,86,0,14.0771,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,4096,7168,torch.float8_e4m3fnuz,86,0,82.8028,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,4608,7168,torch.float8_e4m3fnuz,56,0,105.9346,a8w8_bpreshuffle_256x160x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,7168,256,torch.float8_e4m3fnuz,72,0,16.1387,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,480,7168,2048,torch.float8_e4m3fnuz,102,0,52.4824,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,7168,2304,torch.float8_e4m3fnuz,102,0,56.9516,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,7168,16384,torch.float8_e4m3fnuz,86,0,346.0471,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,7168,18432,torch.float8_e4m3fnuz,102,0,379.6631,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,8192,1536,torch.float8_e4m3fnuz,102,0,43.0079,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,11264,1536,torch.float8_e4m3fnuz,102,0,64.1117,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,24576,1536,torch.float8_e4m3fnuz,102,0,119.4687,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,32768,512,torch.float8_e4m3fnuz,102,0,81.2012,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,32768,1536,torch.float8_e4m3fnuz,102,0,155.1241,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,480,36864,7168,torch.float8_e4m3fnuz,102,0,655.0492,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,512,128,7168,torch.float8_e4m3fnuz,11,0,12.1131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,0,0,0 +80,512,4096,7168,torch.float8_e4m3fnuz,138,0,102.7541,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,512,7168,16384,torch.float8_e4m3fnuz,71,0,346.2792,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,512,7168,18432,torch.float8_e4m3fnuz,70,0,389.6687,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,512,8192,1536,torch.float8_e4m3fnuz,85,0,50.0471,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,512,24576,1536,torch.float8_e4m3fnuz,93,0,126.7363,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,512,32768,512,torch.float8_e4m3fnuz,85,0,82.7804,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,512,32768,1536,torch.float8_e4m3fnuz,71,0,167.8113,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,512,36864,7168,torch.float8_e4m3fnuz,68,0,726.0635,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,128,7168,torch.float8_e4m3fnuz,6,0,17.8979,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,1024,4096,7168,torch.float8_e4m3fnuz,85,0,179.0322,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,7168,16384,torch.float8_e4m3fnuz,85,0,661.406,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,7168,18432,torch.float8_e4m3fnuz,85,0,745.9611,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,8192,1536,torch.float8_e4m3fnuz,71,0,90.3089,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,24576,1536,torch.float8_e4m3fnuz,68,0,241.8128,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,32768,512,torch.float8_e4m3fnuz,71,0,148.0686,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,32768,1536,torch.float8_e4m3fnuz,71,0,317.4319,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,1024,36864,7168,torch.float8_e4m3fnuz,93,0,1406.3674,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,128,7168,torch.float8_e4m3fnuz,119,0,24.0187,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,2048,4096,7168,torch.float8_e4m3fnuz,85,0,328.802,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,7168,16384,torch.float8_e4m3fnuz,85,0,1256.6524,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,7168,18432,torch.float8_e4m3fnuz,85,0,1411.2909,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,8192,1536,torch.float8_e4m3fnuz,71,0,160.9323,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,24576,1536,torch.float8_e4m3fnuz,71,0,459.7917,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,32768,512,torch.float8_e4m3fnuz,71,0,283.8954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,32768,1536,torch.float8_e4m3fnuz,71,0,610.8983,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,2048,36864,7168,torch.float8_e4m3fnuz,93,0,2759.912,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,4096,128,7168,torch.float8_e4m3fnuz,121,0,38.9415,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,4096,4096,7168,torch.float8_e4m3fnuz,85,0,645.3373,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,4096,7168,16384,torch.float8_e4m3fnuz,85,0,2443.998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,4096,7168,18432,torch.float8_e4m3fnuz,85,0,2773.2274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,4096,8192,1536,torch.float8_e4m3fnuz,71,0,307.3715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,4096,24576,1536,torch.float8_e4m3fnuz,72,0,920.9012,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,4096,32768,512,torch.float8_e4m3fnuz,71,0,554.3222,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,4096,32768,1536,torch.float8_e4m3fnuz,71,0,1193.686,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,4096,36864,7168,torch.float8_e4m3fnuz,93,0,5492.6804,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,128,7168,torch.float8_e4m3fnuz,123,0,49.1848,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,6144,512,7168,torch.float8_e4m3fnuz,85,0,143.4871,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,576,7168,torch.float8_e4m3fnuz,68,0,151.638,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,1536,7168,torch.float8_e4m3fnuz,93,0,368.8956,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,2240,7168,torch.float8_e4m3fnuz,69,0,662.2802,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,3072,1536,torch.float8_e4m3fnuz,85,0,183.0209,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,4096,512,torch.float8_e4m3fnuz,71,0,109.7738,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,4096,7168,torch.float8_e4m3fnuz,102,0,937.0884,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,4608,7168,torch.float8_e4m3fnuz,102,0,1071.1237,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,7168,256,torch.float8_e4m3fnuz,71,0,123.4161,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,7168,2048,torch.float8_e4m3fnuz,71,0,509.0789,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,7168,2304,torch.float8_e4m3fnuz,71,0,560.4929,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,7168,16384,torch.float8_e4m3fnuz,85,0,3671.969,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,7168,18432,torch.float8_e4m3fnuz,85,0,4139.4899,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,8192,1536,torch.float8_e4m3fnuz,71,0,455.0275,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,11264,1536,torch.float8_e4m3fnuz,71,0,614.8496,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,24576,1536,torch.float8_e4m3fnuz,71,0,1348.8178,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,32768,512,torch.float8_e4m3fnuz,71,0,793.4262,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,32768,1536,torch.float8_e4m3fnuz,102,0,1794.6908,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,6144,36864,7168,torch.float8_e4m3fnuz,93,0,8229.8778,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,8192,128,7168,torch.float8_e4m3fnuz,124,0,61.8924,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,8192,4096,7168,torch.float8_e4m3fnuz,85,0,1259.5274,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,8192,7168,16384,torch.float8_e4m3fnuz,85,0,4942.7227,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,8192,7168,18432,torch.float8_e4m3fnuz,85,0,5501.8021,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,8192,8192,1536,torch.float8_e4m3fnuz,72,0,621.3308,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,8192,24576,1536,torch.float8_e4m3fnuz,71,0,1786.9965,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,8192,32768,512,torch.float8_e4m3fnuz,71,0,1053.245,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,8192,32768,1536,torch.float8_e4m3fnuz,71,0,2345.8711,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,8192,36864,7168,torch.float8_e4m3fnuz,93,0,10983.7146,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,128,7168,torch.float8_e4m3fnuz,121,0,63.2764,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,10240,512,7168,torch.float8_e4m3fnuz,0,0,211.3264,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,576,7168,torch.float8_e4m3fnuz,68,0,230.3376,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,1536,7168,torch.float8_e4m3fnuz,68,0,583.4998,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,2240,7168,torch.float8_e4m3fnuz,69,0,1018.0438,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,3072,1536,torch.float8_e4m3fnuz,71,0,290.0173,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,4096,512,torch.float8_e4m3fnuz,71,0,173.8713,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,4096,7168,torch.float8_e4m3fnuz,85,0,1574.2695,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,4608,7168,torch.float8_e4m3fnuz,93,0,1722.172,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,7168,256,torch.float8_e4m3fnuz,71,0,199.187,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,7168,2048,torch.float8_e4m3fnuz,71,0,843.0746,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,7168,2304,torch.float8_e4m3fnuz,71,0,913.7132,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,7168,16384,torch.float8_e4m3fnuz,85,0,6072.4257,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,7168,18432,torch.float8_e4m3fnuz,85,0,6834.6666,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,8192,1536,torch.float8_e4m3fnuz,71,0,754.6592,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,11264,1536,torch.float8_e4m3fnuz,71,0,1039.4493,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,24576,1536,torch.float8_e4m3fnuz,71,0,2209.3759,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,32768,512,torch.float8_e4m3fnuz,72,0,1357.7711,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,10240,32768,1536,torch.float8_e4m3fnuz,71,0,2923.8951,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,10240,36864,7168,torch.float8_e4m3fnuz,93,0,13738.3404,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,128,7168,torch.float8_e4m3fnuz,141,0,84.5069,a8w8_bpreshuffle_256x160x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,12288,512,7168,torch.float8_e4m3fnuz,0,0,262.6171,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,576,7168,torch.float8_e4m3fnuz,94,0,285.6522,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,1536,7168,torch.float8_e4m3fnuz,94,0,718.7933,a8w8_bpreshuffle_256x96x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,2240,7168,torch.float8_e4m3fnuz,69,0,1231.0498,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,3072,1536,torch.float8_e4m3fnuz,72,0,352.0628,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,12288,4096,512,torch.float8_e4m3fnuz,71,0,207.219,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,4096,7168,torch.float8_e4m3fnuz,102,0,1858.9016,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,4608,7168,torch.float8_e4m3fnuz,68,0,2079.1883,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,7168,256,torch.float8_e4m3fnuz,71,0,243.398,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,7168,2048,torch.float8_e4m3fnuz,71,0,1001.5776,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,7168,2304,torch.float8_e4m3fnuz,72,0,1134.7441,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,12288,7168,16384,torch.float8_e4m3fnuz,102,0,7272.8109,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,7168,18432,torch.float8_e4m3fnuz,102,0,8054.046,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,8192,1536,torch.float8_e4m3fnuz,102,0,912.1361,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,11264,1536,torch.float8_e4m3fnuz,71,0,1221.6133,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,24576,1536,torch.float8_e4m3fnuz,71,0,2656.0688,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,32768,512,torch.float8_e4m3fnuz,71,0,1573.8691,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,32768,1536,torch.float8_e4m3fnuz,71,0,3526.7572,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,12288,36864,7168,torch.float8_e4m3fnuz,93,0,16426.5845,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,128,7168,torch.float8_e4m3fnuz,86,0,91.5501,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,512,7168,torch.float8_e4m3fnuz,102,0,306.9194,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,576,7168,torch.float8_e4m3fnuz,93,0,341.468,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,1536,7168,torch.float8_e4m3fnuz,93,0,830.4984,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,2240,7168,torch.float8_e4m3fnuz,69,0,1461.7946,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,3072,1536,torch.float8_e4m3fnuz,93,0,405.8954,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,4096,512,torch.float8_e4m3fnuz,71,0,239.3595,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,4096,7168,torch.float8_e4m3fnuz,85,0,2197.8676,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,4608,7168,torch.float8_e4m3fnuz,93,0,2431.1066,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,7168,256,torch.float8_e4m3fnuz,72,0,297.7928,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,14336,7168,2048,torch.float8_e4m3fnuz,74,0,1199.9511,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,14336,7168,2304,torch.float8_e4m3fnuz,71,0,1310.3925,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,7168,16384,torch.float8_e4m3fnuz,85,0,8455.8581,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,7168,18432,torch.float8_e4m3fnuz,85,0,9585.3754,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,8192,1536,torch.float8_e4m3fnuz,71,0,1039.5469,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,11264,1536,torch.float8_e4m3fnuz,71,0,1422.2274,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,24576,1536,torch.float8_e4m3fnuz,71,0,3089.1315,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,32768,512,torch.float8_e4m3fnuz,71,0,1827.3845,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,32768,1536,torch.float8_e4m3fnuz,71,0,4108.8811,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,14336,36864,7168,torch.float8_e4m3fnuz,93,0,19165.8682,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,128,7168,torch.float8_e4m3fnuz,0,0,114.6838,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,4096,7168,torch.float8_e4m3fnuz,85,0,2498.7773,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,7168,16384,torch.float8_e4m3fnuz,85,0,9679.208,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,7168,18432,torch.float8_e4m3fnuz,85,0,10922.6859,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,8192,1536,torch.float8_e4m3fnuz,71,0,1182.4869,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,24576,1536,torch.float8_e4m3fnuz,71,0,3536.4999,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,32768,512,torch.float8_e4m3fnuz,71,0,2090.8366,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,32768,1536,torch.float8_e4m3fnuz,71,0,4672.9569,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,16384,36864,7168,torch.float8_e4m3fnuz,71,0,21975.6028,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,128,7168,torch.float8_e4m3fnuz,121,0,208.4886,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,32768,576,7168,torch.float8_e4m3fnuz,68,0,729.7961,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,1536,7168,torch.float8_e4m3fnuz,68,0,1872.4274,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,3072,1536,torch.float8_e4m3fnuz,71,0,909.865,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,4096,7168,torch.float8_e4m3fnuz,74,0,5016.1101,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,32768,7168,2048,torch.float8_e4m3fnuz,72,0,2704.1764,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,32768,7168,16384,torch.float8_e4m3fnuz,85,0,19294.1288,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,7168,18432,torch.float8_e4m3fnuz,85,0,21856.5731,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,8192,1536,torch.float8_e4m3fnuz,71,0,2344.2068,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,24576,1536,torch.float8_e4m3fnuz,71,0,7017.8836,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,32768,512,torch.float8_e4m3fnuz,71,0,4165.6734,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,32768,1536,torch.float8_e4m3fnuz,71,0,9432.9791,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,32768,36864,7168,torch.float8_e4m3fnuz,93,0,43711.479,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,128,7168,torch.float8_e4m3fnuz,121,0,383.2232,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,0,0,0 +80,65536,512,7168,torch.float8_e4m3fnuz,70,0,1327.0618,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,576,7168,torch.float8_e4m3fnuz,93,0,1429.9239,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,1536,7168,torch.float8_e4m3fnuz,93,0,3736.9156,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,3072,1536,torch.float8_e4m3fnuz,71,0,1787.3796,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,4096,512,torch.float8_e4m3fnuz,71,0,1055.1661,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,4096,7168,torch.float8_e4m3fnuz,85,0,9976.6212,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,4608,7168,torch.float8_e4m3fnuz,68,0,11016.6648,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,7168,256,torch.float8_e4m3fnuz,71,0,1212.7558,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,7168,2048,torch.float8_e4m3fnuz,71,0,5287.9696,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,7168,2304,torch.float8_e4m3fnuz,71,0,5871.5636,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,7168,16384,torch.float8_e4m3fnuz,85,0,38748.1812,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,7168,18432,torch.float8_e4m3fnuz,85,0,43467.0784,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,8192,1536,torch.float8_e4m3fnuz,71,0,4684.7126,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,24576,1536,torch.float8_e4m3fnuz,0,0,inf,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,32768,512,torch.float8_e4m3fnuz,74,0,,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1,0,0,0 +80,65536,32768,1536,torch.float8_e4m3fnuz,0,0,,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,65536,36864,7168,torch.float8_e4m3fnuz,0,0,,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,128,7168,torch.float8_e4m3fnuz,87,0,561.5334,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0,0,0 +80,98304,512,7168,torch.float8_e4m3fnuz,102,0,1866.4062,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,576,7168,torch.float8_e4m3fnuz,95,0,2112.9835,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,0,0,0 +80,98304,1536,7168,torch.float8_e4m3fnuz,102,0,5497.497,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,2240,7168,torch.float8_e4m3fnuz,69,0,9755.5399,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,3072,1536,torch.float8_e4m3fnuz,102,0,2698.4705,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,4096,512,torch.float8_e4m3fnuz,71,0,1607.6098,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,4096,7168,torch.float8_e4m3fnuz,102,0,14573.0079,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,4608,7168,torch.float8_e4m3fnuz,102,0,16377.2032,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,7168,256,torch.float8_e4m3fnuz,72,0,1997.5301,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,98304,7168,2048,torch.float8_e4m3fnuz,72,0,8086.5215,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,0,0,0 +80,98304,7168,2304,torch.float8_e4m3fnuz,71,0,8879.3992,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,7168,16384,torch.float8_e4m3fnuz,102,0,57025.1637,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,7168,18432,torch.float8_e4m3fnuz,102,0,63990.1906,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,8192,1536,torch.float8_e4m3fnuz,71,0,7034.6763,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,11264,1536,torch.float8_e4m3fnuz,71,0,9673.3902,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,24576,1536,torch.float8_e4m3fnuz,0,0,,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,98304,32768,1536,torch.float8_e4m3fnuz,0,0,,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,128,7168,torch.float8_e4m3fnuz,0,0,719.3472,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,512,7168,torch.float8_e4m3fnuz,102,0,2615.157,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,576,7168,torch.float8_e4m3fnuz,93,0,2822.5197,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,1536,7168,torch.float8_e4m3fnuz,68,0,7429.8789,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,3072,1536,torch.float8_e4m3fnuz,71,0,3559.9187,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,4096,512,torch.float8_e4m3fnuz,71,0,2125.6078,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,4096,7168,torch.float8_e4m3fnuz,85,0,19941.3998,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,4608,7168,torch.float8_e4m3fnuz,93,0,21987.1209,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,7168,256,torch.float8_e4m3fnuz,71,0,2444.3347,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,7168,2048,torch.float8_e4m3fnuz,71,0,10519.7293,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,7168,2304,torch.float8_e4m3fnuz,71,0,11621.1373,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,7168,16384,torch.float8_e4m3fnuz,85,0,77358.0635,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,7168,18432,torch.float8_e4m3fnuz,85,0,87126.4572,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,8192,1536,torch.float8_e4m3fnuz,71,0,9394.7156,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 +80,131072,24576,1536,torch.float8_e4m3fnuz,0,0,,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,0,0,0 From 48e3a377e70e3040c294cbe4e991faafd66f0205 Mon Sep 17 00:00:00 2001 From: minmengdie Date: Fri, 21 Nov 2025 02:20:06 +0000 Subject: [PATCH 05/10] update the smoke test --- op_tests/cpp/mha/smoke_test_fwd_v3.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/op_tests/cpp/mha/smoke_test_fwd_v3.sh b/op_tests/cpp/mha/smoke_test_fwd_v3.sh index 1c74659fb3..e55e8119b8 100644 --- a/op_tests/cpp/mha/smoke_test_fwd_v3.sh +++ b/op_tests/cpp/mha/smoke_test_fwd_v3.sh @@ -24,10 +24,9 @@ run_gfx950_fwd_v3() { for mask in 0 2 ; do for lse in 0 1 ; do for seqlen_q in 127 192 301 512 1024; do - for seqlen_k in 512 700 1023 1058; do + for seqlen_k in 0 512 700 1023 1058; do $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=$head_dim -d_v=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS - $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=$head_dim -d_v=128 -s=$seqlen_q,$seqlen_k -s_k=$seqlen_k,0 -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=3 -h_k=1 -d=$head_dim -d_v=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=1 -h_k=1 -d=$head_dim -d_v=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS @@ -37,6 +36,10 @@ run_gfx950_fwd_v3() { $EXE -prec=bf16 -b=1 -h=1 -h_k=1 -d=$head_dim -d_v=128 -s=$seqlen_q -s_k=$seqlen_q -iperm=$i_perm -operm=$o_perm -mask=1 -lse=$lse -fwd_v3=1 -mode=$mode -kname=$KNAME $COMMON_ARGS fi + if [[ "$mode" = "1" ]]; then + $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=128 -s=$seqlen_q,$seqlen_k -s_k=$seqlen_k,0 -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS + fi + done done done @@ -49,17 +52,16 @@ run_gfx950_fwd_v3() { run_gfx942_fwd_v3() { echo "Start smoke test for gfx 942" - for mode in 0 1 ; do + for mode in 1 ; do for i_perm in 0 1 ; do for o_perm in 0 1 ; do for mask in 0 2 ; do for lse in 0 1 ; do for seqlen_q in 127 192 301 512 1024; do - for seqlen_k in 512 700 1023 1058; do + for seqlen_k in 0 512 700 1023 1058; do for v3_bf16_cvt in 0 1 2; do $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS - $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=128 -s=$seqlen_q,$seqlen_k -s_k=$seqlen_k,0 -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=3 -h_k=1 -d=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS $EXE -prec=bf16 -b=1 -h=1 -h_k=1 -d=128 -s=$seqlen_q -s_k=$seqlen_k -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS @@ -69,6 +71,10 @@ run_gfx942_fwd_v3() { $EXE -prec=bf16 -b=1 -h=1 -h_k=1 -d=128 -s=$seqlen_q -s_k=$seqlen_q -iperm=$i_perm -operm=$o_perm -mask=1 -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS fi + if [[ "$mode" = "1" ]]; then + $EXE -prec=bf16 -b=2 -h=4 -h_k=2 -d=128 -s=$seqlen_q,$seqlen_k -s_k=$seqlen_k,0 -iperm=$i_perm -operm=$o_perm -mask=$mask -lse=$lse -fwd_v3=1 -v3_bf16_cvt=$v3_bf16_cvt -mode=$mode -kname=$KNAME $COMMON_ARGS + fi + done done done From f6876c5477f8af007cf243b501cd55e44972a151 Mon Sep 17 00:00:00 2001 From: minmengdie Date: Fri, 21 Nov 2025 02:28:12 +0000 Subject: [PATCH 06/10] update the smoke test --- op_tests/cpp/mha/smoke_test_fwd_v3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/op_tests/cpp/mha/smoke_test_fwd_v3.sh b/op_tests/cpp/mha/smoke_test_fwd_v3.sh index e55e8119b8..407ab2cd14 100644 --- a/op_tests/cpp/mha/smoke_test_fwd_v3.sh +++ b/op_tests/cpp/mha/smoke_test_fwd_v3.sh @@ -52,7 +52,7 @@ run_gfx950_fwd_v3() { run_gfx942_fwd_v3() { echo "Start smoke test for gfx 942" - for mode in 1 ; do + for mode in 0 1 ; do for i_perm in 0 1 ; do for o_perm in 0 1 ; do for mask in 0 2 ; do From 46397e4302271d4320f1b84fc0d0364ddfff4224 Mon Sep 17 00:00:00 2001 From: minmengdie Date: Fri, 21 Nov 2025 02:59:29 +0000 Subject: [PATCH 07/10] fix MI300 and MI308 err --- .../MI300/fwd_hd128_bf16_causal_rtna.co | Bin 29792 -> 29640 bytes .../MI300/fwd_hd128_bf16_causal_rtna_group.co | Bin 29968 -> 29816 bytes .../MI300/fwd_hd128_bf16_causal_rtne.co | Bin 31392 -> 31240 bytes .../MI300/fwd_hd128_bf16_causal_rtne_group.co | Bin 31568 -> 31416 bytes .../MI300/fwd_hd128_bf16_causal_rtz.co | Bin 25952 -> 25800 bytes .../MI300/fwd_hd128_bf16_causal_rtz_group.co | Bin 26128 -> 25976 bytes .../fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna.co | Bin 27800 -> 27648 bytes .../MI300/fwd_hd128_bf16_rtna_group.co | Bin 27928 -> 27776 bytes .../fmha_v3_fwd/MI300/fwd_hd128_bf16_rtne.co | Bin 29400 -> 29248 bytes .../MI300/fwd_hd128_bf16_rtne_group.co | Bin 29528 -> 29376 bytes .../fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co | Bin 23952 -> 23800 bytes .../MI300/fwd_hd128_bf16_rtz_group.co | Bin 24088 -> 23936 bytes .../MI308/fwd_hd128_bf16_causal_rtna.co | Bin 29128 -> 28976 bytes .../MI308/fwd_hd128_bf16_causal_rtna_group.co | Bin 29304 -> 29152 bytes .../MI308/fwd_hd128_bf16_causal_rtne.co | Bin 30728 -> 30576 bytes .../MI308/fwd_hd128_bf16_causal_rtne_group.co | Bin 30904 -> 30752 bytes .../MI308/fwd_hd128_bf16_causal_rtz.co | Bin 25288 -> 25136 bytes .../MI308/fwd_hd128_bf16_causal_rtz_group.co | Bin 25464 -> 25312 bytes .../fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co | Bin 27072 -> 26920 bytes .../MI308/fwd_hd128_bf16_rtna_group.co | Bin 27200 -> 27048 bytes .../fmha_v3_fwd/MI308/fwd_hd128_bf16_rtne.co | Bin 28672 -> 28520 bytes .../MI308/fwd_hd128_bf16_rtne_group.co | Bin 28800 -> 28648 bytes .../fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz.co | Bin 23224 -> 23072 bytes .../MI308/fwd_hd128_bf16_rtz_group.co | Bin 23360 -> 23208 bytes 24 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co index 146a54651588c2de718355c3dad699132f06f1ed..c926a2adad985ca755210cb2e44801568abeefe9 100755 GIT binary patch delta 136 zcmaFxg7L(2M$URyA2%jOMhE7Jobrqw8#UuKH&4)`z0Z3epJ?@kjM>H2P)BI6{%lmy>dgdmqja~meBXIkW>4l^ z&uT}b!OXUfbJq@=jK?7nkl=J2MYk11=#EoK6^#O)2~)W4BQwZ?1`-7NJHR%k9yw=h#)3TBJ)my$x0K`&|{5B>o8b7a^6 diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna_group.co index b508c646dc996560ff6ad201c0fbb0a9f9ccc0d1..734193f09a22f7b86e748908cb34a8306ab02fcf 100755 GIT binary patch delta 141 zcmbR6it)z_M$URyA2%jOMhE7JobpUR3N~up)7&heHGzqPy@7kb5O3@L%?oq_9M}vk z^bGV2CNE@^npDg-d0lRjREre@3a|k4*co0FY}U&2V&XG&HDt(1OiIm(H*hm|+}u>K hj+s$>vSEoi(~q3VStWLi7L!+&*fXujpZuys4FEu$DLDWD delta 305 zcmezIf^ottM$URyA2%jOMhBLOobpT)3O8!q)8t@p;NHKDw{`z!4y_4Hn-}N=IMf>% z=mDXEhJH?dUV3U#d_iJKMqXlWYJ72id`U)PNqkX3ZgG5aVqSc4X+c4LQ3+5*a&}^R zW?ni_J~ciiHK{Z`J~J;ZKRzwLC_XVKCqBP8zMv>SEi)&zSiiU^SwAPIELRsOl9ifV z0_G^BnHX4@rrxuTQT6)Q1K d$enzt*pBhSWWf@9E(fSlzKoM^7s)a*000?)VyplF diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co index 7d95f3c2808bd9df84d9ff5b117ce749f7a95955..8542efc6ccb13531793853fd81d9791d19be1f68 100755 GIT binary patch delta 138 zcmZ4Rm9gUsBWJylYU#>*wT@<>~@OvQm>v zz#N4%69WtLG_xcVBYBH7gH*FLQ*+bQq{L*4M2kdoBa@WGBts+P6a%wV&B+THr6yk} z3YctBEWoA900Qg`JXV{nitU*A4Gl~j8FCVnQgh-B44sWPx0fztW?Vn{V3Fu#yGkXd hfa1wXm3EvJrC@Cg1z?H#3eL&LD`=)z`CW06REre@3a|k4*cn!oY|bk2Wa2k4bTwqiNlZ%3i8nMbbKHEW iY#lSB_~gPWb4H8FE3538DoQ86sbt#iEnnRVp!F km@HUj&nZv_F{1!1u3yPHxx7k}s{tzN%Q#uOQkIbc07oih!2kdN diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz.co index 4dd3d45a8c7507f1652c2d59e597193bab8d7a2f..0c367eebd6e4a37d76a6a935d43b48824b93a8f2 100755 GIT binary patch delta 136 zcmaEGit)rrM$URyA2%jOMhE7Jobrqw8#UuKH&4)L7=A;(u7Z)Y#=j4>->HJF1lSpPtTz9OwP)gUF?3|eNlZ%3i8nBGHs0)>u#lN?{p7}2(aHakm6!zL fCM%`baatsRbukoxCG3+qC%30aGOn1sFhv~zLUv<< diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz_group.co index 7b08aa9b137d9a9ddbdc3e4495e5fb13504d8d71..244d372e99b193a6c40ee6b670f90212d0b4d4d0 100755 GIT binary patch delta 141 zcmbPmhVjQKM$URyA2%jOMhE7JobpUR5;tny)7&heHGzqPy@7lGC9c-}n-}N=7_u2! z=o#o4OkT(+H7SK{^18SrsTL~+6kq}7u`|3#->eny#l&amYRHh2n3S3mZ{T9=xVb5D h9W$f&WW!W*rXR7Bvr_FCEheu_wP#w9F!@!g8URcMDE9yW delta 305 zcmexyigCgjM$URyA2%jOMhBLOobpT)k~V7H)8t@p;ND-x-MW7>ht>q9%?oq_4C{>y z^ng%7Lq8`!FFmy=z96wABQG&GHNH4Mz9b{DB)+I1w>UmIF)zNjw4flrs064YIXf{u zGcO$|pBkT%npBz|pP84IAD@~vmYI`UtY2J|te=xpma7XC$x2Nw z0do}6Objf{)69}gjN~oS3{uU~OwCPGlM<6H5-k$VjZ9J!lMIcFQw+>fH774*l$tCM zmm&2codE?{fO+f;Jyx41#d$IDxfnY#g$5 zJp(<1$qN~!lo%O6fSp0Z63l=TJYW_(14GtktF(8Fe1;Z|3^|EOsX6fmF3tv<%`%#p l8O0~>%rk*FXD6m- z=A{GWQ{z)olScE{a&>_sS*gh- zV2(nXiGhWAnpu*Gk-SBkL8@7rskv!tQev`2qD7*)kx5EolA)1tih)_G=H!KpQj;4} zIiwh}7*GHYn8(gwV7Ylw>N`e07iR;8oW!KmoOlC63+K)A(wmqW*H2c=HfLf;n;eyG W$GBng%xrs3g$#%(A16P|lm`F;!Cl7y diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna_group.co index acfb38006131c73c2e933c1978d55d7432f21e14..9eaf54e9dbdb423dc5a6228ecab6003de74060fb 100755 GIT binary patch delta 156 zcmbPni?QJ*BWJyY=#zk z26_gQ7cxpso{**>wZM`A1&D)r>=Ze5BPFP@AB)KJCd|?YZrIrmaN^} zM6wF~JM8A)A&Q|+zBhf|d%WcbDmF-4jPCDLgl>1AwK^)^jIZW?ZF#VMIDA}}XU3CS zzMr4mEG+Y~a--Q|Jn=~i0_Q=3Cj>GMd4O|B#Uuj{Cy-5r;Avhhycqjo;#WO^F>$9p zgbC$9c?y^@;2DUN(~yy@m8D*bvDxIJN+ROrTWHXZ>DaXAcC5NXP12)XyGvZ`IoQFr z)fr>AZC#Dc9x>z!=~6^{@`vd9JW~GYcuO_q<+`-!CpsDqM+(C(VPLiggYq<3?4Vb< ZifwgH8u2F@cGLy@7lGe4f_*n_aYH9M}vk z^bGV2CNE@^QetEP0d@urOE3dY@PJwD3igW*6-khk7Fe zJs?!j(9g-wOHVC|FGwuO$V<#kjW5oRFUd$Oi7zV1Esjr4%!@BBEhxw@DgmlU&Q46v z%u5H#r^ctGCY7egXXd5l$EW2N#V6+E#OD{s7Zl~EW#*(7>lYU#>*wT@<>~@OvQm>v zz#N4%69WtLG_xcVBYBH7gH*FLQ*+bQq{L*4M2kdoBa@WGBts+P6a%wV&B+THr6wQ9 z=8!s&%YXuSz&v&a1Ix{ivfnZCxjGpzD}Ovri_@% delta 301 zcmX@`l<~$hM$URyA2%jOMhBLOobrq}HfsLX;9zgy-tWxYx_`5Z<^-nAd$eO5>WvKa zfKWk0KPNvgJ+&ylAh9GPFEKYYzBoU=BqOmTzNjF#I6gTsFTS|6pdi1f1gIi8J25>o zFC8eK8lRGyRGJ>2nU|IypO#+~pO}*qpI;nbP?VpRnUh+qUtE-|pOaITs|ys#N=+^S za}?4{3@psk%#uuug>vplbDp66K`PX=(ssCub!E4{p5#*=8QWg YOBUHPW#mncDzf9e0X63Db%7 diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co index 7ef5602d630abe2e9fbc0201f0dbdf58cf562af4..91cb20f2a67b159c7ea5d013a57edd4d5e5e049f 100755 GIT binary patch delta 147 zcmbQRoAJj^M$URyA2%jOMhE7JobpT+ksGy|G&XP1n83ur-oU*-pR;xUW*2Q4Z8k#- zJp(<1$qN~!CO1U0$*?nMSTdjh9x#ucfgy3TRrotbK0^yfhMdHt)SP$&CkunkW|39Q ijN+3w#+Wl@giU@GW5*~lSu@t2vm*jx%KOQM(eeP(awMw& delta 308 zcmeydlX1dsM$URyA2%jOMhBLOobpTpQ5&_IG&tBBxc3WlweH`%Mq>igW*2Q4?Rp~v zJs?!j(9g-wOHVC|FGwuO$V<#kjW5oRFUd$Oi7zV1Esjr4%!@BBEhxw@DgmlU&Q46v z%u5H#r^ctGCY7egXXd5l$EW2N#V6+E#OD{s7Zl~EW#*(7>lYU#>*wT@<>~@OvQm>v zz#N4%69WtLG_xcVBYBH7gH*FLQ*+bQq{L*4M2kdoBa@WGBts+P6a%wV&B+THr6xCo zaY!*FGN1q+Fpr(Vz;g4Vuy>4nP8J3XIf+TBIq?RD7S5aJMN}~}uAf{OCOTO!Mu~|b dd~#5X9pi?{6JzW-6(S)fy$4I@M{_bV0042|Un>9r diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz_group.co index ff75aceb0d6c3ce84b88ae1c956705e2e4c7a964..d0d53352d37f39139d6b3ee79a802cb9ea0c8422 100755 GIT binary patch delta 156 zcmbQShp}NdBWJyr^Qc7}??&8Na2GV&QZS}^1!CZ*=Y8#r0IZr&ML&&()3 pxiMUHvR|wc(}S?dMX`3A6CxmH6o4iAV>l;YkCkL(nEWwT9RQo9Dxv@Y delta 302 zcmZqJ%{XHZBWJy~-NkXVwDmzbLxUz{Ibl95;vUsRA=9G{$+7hhajP>^3#0#uQlotU1P zmkyLqjZaBUDou~i%uCCUPs=ZgPt3`Q&o7QID9TUE%tf&&esv)dh-Vr6!kv zISOee1{UUNW=SSS@)l_Zsb*=W=BBAhiOCj;7K!FYCMk(YhDOFI24<<6lNT~dO)iLL zoBS_KN~$7}0R@PIdF%`;EH@j4KV;-{vUFw0NlZ%3i8nBGblj{MSOjPIf03Ty@7jwHc#vR%_=%uY}pJg z^bGV2CNE@^nmi#ZK#IeP0R`xSdF%`qayS3VvS(s8bT*o7m@B;5KX)cGqxj^;EYZpT h3Y3^0WKLEpv}5F$oLFejRgnWR!i;gUc7ZG-0|1SqCOrTE delta 294 zcmdn+i1EZ@M$URyA2%jOMhBLOobpT_c^kEoG&$HCxcBq(w(j5Dqd9?Tvx?3Z+j=7d zJs?!j(9g-wOHVC|FGwuO$V<#kjW5oRFUd$Oi7zV1Esjr4%!@BBEhxw@DgmlU&Q46v z%u5H#r^ctGCY7egXXd5l$EW2N#V6+E#OD{s7Zl~EW#*(7>lYU#>*wT@<>~@OvQm>v zz#N4%69WtLG_xcVBYBH7gH*FLQ*+bQq{L*4M2kdoBa@WGBts+P6a%wV&B+THr6w!n zvrYb&=_Pd`mjMOnf_dx=0#=)ovh107U7buAauSnLbK(sQoj0$`S;Wk^esW-eIa5W} W>+M$URyA2%jOMhE7Jobrqb8#NzmZWhs+z{J7cz`Z}4r*;44B|2Md*$gf8 z4D<{pFJzROERdZc*kQ$h0xTvMW{YoLkZr@nYv^jkkdv5{niFr}>azJ|?s{fM@yU$Y lqLbGZDlsNZzF26_xgZCkxd1HwzJPPGd66Wez~sOpbpYP~D?|VQ delta 280 zcmW-Xy-tHr9L2e4tW_JGUD{1CDikT8i64VF7#E-5p9|be!3&qKIyhJt_nP|zCJt_m z!NsYs;3JsShagU(=S;uzJ705Df1~!;xw?=MI$L~dcTxSO%$4QV!|H6k^Ej*4b|!N5 zvip8K*0ueny=|58Bp^AYxEv-pCQxuFhq#11jSKK_0>vmzd0sXT?*<295;UHGPux)e zVL~}jo&#nKcmZk7X~;;S7P+S~b{p?COGLc#A9~c#Y>S$%p?7Vn5t9zA0dcWsV;ft# z;bXV2ANZO{6!DBU#V219_mL~=Xk*2e{?d{rN7YYsDR-Q6Lt@w^479#eUB}Y}G!-bb UP%RF#Q`wHiU3P+A#YeXF3sx9nlK=n! diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtne.co index 400da07fb47337980f25d70f84a1f97f68115fa2..0baf92f49aba1dc0d25f1777ea661ce9376ac378 100755 GIT binary patch delta 144 zcmeD9!1&=EBWJylYU#>*wT@<>~@OvQm>v zz#N4%69WtLG_xcVBYBH7gH*FLQ*+bQq{L*4M2kdoBa@WGBts+P6a%wV&B+THr6wno zvQ4%t@RFKP$bbTL!8~>b0jtf63hbG9-5gCAauSnLbK(sQoj1QLTExt_e)7aJbH*K$ VPnOv;KA6l|ZqF1@JlU&U4FL4)VQByW diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtne_group.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtne_group.co index 4dd1b5f0c20935eccc46ac59b74495603d01538e..83d49576c665e1581c7b4d00a810f4d5bd33d277 100755 GIT binary patch delta 151 zcmdn-k#WHXM$URyA2%jOMhE7JobpTyN;Yae(%dYfHGzqPy@7jwG;izv%}aE?II|gA z=o#o4OkT(+H94R#L$Je&0R>o0-dHHU`9Yx#6R)AG5kpR5Qff}Tft&MY&64%ZjN+3W n3q>ctD_3G#P%v4j!jAJn5kz+ZSVF&?b8>lwBxAtji52PqcyTQT delta 282 zcmW-XF;Buk9L2e6L{t)H7uXb{w$##I7>I_zVEhjMT+3ZSk6!7ul(^vH4!JL&i90b} zjDvg^Cr3?OoV5Dh^n1T||AC%AQEA@0>}v>}Z7xey4JS{HxB7PRc6a?#zFTK|jfIvy z9({G^rg4(DcBpd^_%wo$#KVA$DM$hFki-y$V+kG!Kqg@*qBwte*Tf$Me(os@sGImO z3YY*UA|RZDkPt?KjX0IMj69w5>G(KbuBaFPLycJmc9`v2W)(Ao+N|ZYs7pLdFmX(4 zK-{KzIxuWnR}W}kRi<0&HFOo8RCaLTFU7*5owX+yTDjk~3Y@rOjPIf03Ty@7kbHdpKZ%_=%u^w~hKW-*7qxj^;Xwk|4 h5|x-9L`_yovSZ|!oS0~tM$URyA2%jOMhBLOobpT_@f)?0G&$HCxc8suYTduNM{@$xW)+<+di6#I zdO)b4p`VkVm!4V_UyxXmk(Zd88eg0rUy_kn5?@r1TO6O9m=|AMT2PQ*R033yoSm4S znU@ZfPmNDWO)5=~&&*59k59`licieRiO(;NFDS}S%gjkF)-Ntf*3Zc)%hd&nWThsT zfH?|jCI%MfX=X_#M)DSE2B~Igrsk%pNr}l8i57|GMkXnVNrpzoDF$Y#nv)kYN=;Ts zV4M6e%1i1(3Ihtz1@qV$1gthEMcXs+I-8p?d9oyb3vgL5fB-wg1gp)zqP>`Soh?lmauSnLbK(sQT{j2Dt!HLjKbbMfobktG V%Orayi@3>ENp_4YChtsA2LP%@VqyRQ diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co index 437e5f0fe1d883b85983021b79289aa783434379..1212d40271496567aed2bcc7578d9a38413d2538 100755 GIT binary patch delta 140 zcmX?bnQ_G>M$URyA2%jOMhE7JobpU7Qa5U~X>8u1F@cGLy@7lG53bhzn?1B0%-IYr z^bGV2CNE@^n(UCwA*f@?fC6|XJ0^>7E=YdB$Y*Hb#E_Gil$sN7;9}soIWDz{nNfW5 f!whq#2}zTsGVK^0CP!x4b8bk181QlO!gP560z@ZY delta 286 zcmW-YF;9a)9K|^_saWG`;$S9_ntn{+ZWQt$7d%hLjK~t)=_=_x^<&3wXWLJhxYZfe$nsL zdbsVoW6Rua+|>;s(~#wm;WA2b!k~~)M!1ALOA4Skg<_OtGB2A2y})6dhE2~PU}O|R zoN@_V=75C&SwNOc9t&0&MNSPNE|Z;xoHJU^Lzmm8=W>VGR>$Keb9mqFGlHpyJ?vU` zfJx8V3rvS?R*z^Mp=w6fmRFR2wG?~5{vbbSbzmMT0uv^{?0JLwJ(w(@=jt?aRh?Fo W=m33Jv*=J8viVQ>t!~2&r2GM;YF|$P diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna_group.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna_group.co index c76fac64ee613c2f2d3f30237104c26ed13c08c6..95c9a4a1f30cddd0f9bca4fe6c08fa5fc1878927 100755 GIT binary patch delta 142 zcmX?bg>l7YM$URyA2%jOMhE7Jobrqs8#VuHY&OuGz{J7cz`g$mSL^=G2eci`*$gf8 z4D<{pFJzROY>>e=d0(=W;1Wv)6d*qNVzT(Al*^2KhK`mDIf+TBIq?QAMi!eNrq(kv iice0=G-u?PJTudtX-CTBTbXv88mSQde9zcDBnS66F-S!-<#g=z4!Ww>Mx{Dj?Ye72o2{S)gG!pj9-1BecPEnb?&E?ZalQA zdTZ!SY-^{nrkh-(5zQgPWt`#>g+f3X;}Y_0Q~<##6yq!tdD$$;HI9-rYI+JG4aN~9 zDHFg%4w!Qg1!TEk38#fwW5K5Y(|`Kmk0H8#Bc>Z^(SW$Y*Hb#E_Gil$sN7;Ns@Ed0uuCGo$!q g#e8$73mKE6^6i)!vL?^Ux8wYf1u@~{pfQ1IvxjzrZM~6! z9uO*M=;!3;rKc9f7bKQsk*FXD6m- z=A{GWQ{z)olScE{a&>_sS*gh- zV2(nXiGhWAnpu*Gk-SBkL8@7rskv!tQev`2qD7*)kx5EolA)1tih)_G=H!KpQjCm~ z88gMXco;x{ox#L%Ggsz2Mm`reM~0llq|}^v149d!&1~6C%#7RUj&G delta 311 zcmaEHpRwT~BWJyKiQG&tBBxcB$)wC>-mp*ewR^8xJ!+j=7d zJs?!j(9g-wOHVC|FGwuO$V<#kjW5oRFUd$Oi7zV1Esjr4%!@BBEhxw@DgmlU&Q46v z%u5H#r^ctGCY7egXXd5l$EW2N#V6+E#OD{s7Zl~EW#*(7>lYU#>*wT@<>~@OvQm>v zz#N4%69WtLG_xcVBYBH7gH*FLQ*+bQq{L*4M2kdoBa@WGBts+P6a%wV&B+THr6wn2 zDomb{DZnMp00Qg`Yb-a<%6!Pk=W1xdkdv5{niFqe=;*Y$GrOLdasA}LOwq}5`AUor jCOhWab56(xYhow>i^u12PF|ib$;prd5&a7jWn=&V99w3a diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz.co index a310495583dc1f74510697a1764ae4586deb2950..9148e582f65022423226e70ae81d3672d62e8ecf 100755 GIT binary patch delta 150 zcmdn7m2trqM$URyA2%jOMhE7JobpTy!ZvEPX>8u1F@cGLy@7lG4vyCSn?1B0G}sI+ z^bGV2CNE@^n(Pq5A*f@?fC6|XJBEmFE(m$R$Y*Hb#E_Gil$sN7;OOkQIWDYlDLM$URyA2%jOMhBLOobpT-;TyHuG&tBBxcA3!w(j4&L1O~bW)E!#jd~*k zJs?!j(9g-wOHVC|FGwuO$V<#kjW5oRFUd$Oi7zV1Esjr4%!@BBEhxw@DgmlU&Q46v z%u5H#r^ctGCY7egXXd5l$EW2N#V6+E#OD{s7Zl~EW#*(7>lYU#>*wT@<>~@OvQm>v zz#N4%69WtLG_xcVBYBH7gH*FLQ*+bQq{L*4M2kdoBa@WGBts+P6a%wV&B+THr6vnR zuuVP|EX2jb00Qg`CYGB|1;1nDb98oO$Vp5}&51WKv~bycEVPQ5as6b+NOLBQkjYh% Xc8oVB?~JtP^aum%V|YKAF+v^y_ncmM diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz_group.co index 9bdf92bdb8f1ece96519a094a49a9e629a1cc766..d7ee230d66e3636a34e51f9be60ea40902f62730 100755 GIT binary patch delta 142 zcmX@Gjd8_RM$URyA2%jOMhE7Jobrqs8#VuHY&OuGz{J7cz`cJ5N9+F02echD*bFW7 z4D<{pFJzROY!Jydd0&W>;1Wv)6d*qNVu<*r(94W`hK`mDIf+TBIq?RLt`?gghSf7O iice0AGH2wNJTuCkX-DYfTTynL8etIqzb7k3$^!sjohS(a delta 296 zcmZ3nmGQtfM$URyA2%jOMhBLOobrqZHfsLY;9zgy-XFu+x_`5V<^-nA2echD>WvKa zfKWk0KPNvgJ+&ylAh9GPFEKYYzBoU=BqOmTzNjF#I6gTsFTS|6pdi1f1gIi8J25>o zFC8eK8lRGyRGJ>2nU|IypO#+~pO}*qpI;nbP?VpRnUh+qUtE-|pOaITs|ys#N=+^S za}?4{3@psk%#uuuYK1lSqYSZ Date: Fri, 21 Nov 2025 07:12:17 +0000 Subject: [PATCH 08/10] fix qseq >> kseq error MI300 and MI308 --- .../MI300/fwd_hd128_bf16_causal_rtna.co | Bin 29640 -> 29648 bytes .../MI300/fwd_hd128_bf16_causal_rtna_group.co | Bin 29816 -> 29824 bytes .../MI300/fwd_hd128_bf16_causal_rtne.co | Bin 31240 -> 31248 bytes .../MI300/fwd_hd128_bf16_causal_rtne_group.co | Bin 31416 -> 31424 bytes .../MI300/fwd_hd128_bf16_causal_rtz.co | Bin 25800 -> 25808 bytes .../MI300/fwd_hd128_bf16_causal_rtz_group.co | Bin 25976 -> 25984 bytes .../MI308/fwd_hd128_bf16_causal_rtna.co | Bin 28976 -> 28984 bytes .../MI308/fwd_hd128_bf16_causal_rtna_group.co | Bin 29152 -> 29160 bytes .../MI308/fwd_hd128_bf16_causal_rtne.co | Bin 30576 -> 30584 bytes .../MI308/fwd_hd128_bf16_causal_rtne_group.co | Bin 30752 -> 30760 bytes .../MI308/fwd_hd128_bf16_causal_rtz.co | Bin 25136 -> 25144 bytes .../MI308/fwd_hd128_bf16_causal_rtz_group.co | Bin 25312 -> 25320 bytes 12 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co index c926a2adad985ca755210cb2e44801568abeefe9..98714c79d528be27e301db15265460eaffe58719 100755 GIT binary patch delta 297 zcmX@{obkeQ#t9mX0TVTsJAUzC00S7!AOPVr0x32i{*evg|L8`g3m{wuj>&?I;({2e zCR;M9D`JR3&4BCO+{nnB#V9bO&53y>j69|cIf+TBIq`-@#+I8O=DRa7eesywm?JvbzDSZ2?&QgdMdm^o tJrE-@av<~!=E>ScvXhS&Nit?kepqDB*f3eK*q(942tz8Fj=zLT<`(ZDG#9DQ<$7sY|i*#^2}m;rXM+zZx!1yT1=KKv1eQ{IkH3@ E0J6qLt^fc4 diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co index 8542efc6ccb13531793853fd81d9791d19be1f68..490ad85ce4b7a4dec1e471e379e3b1ff54423236 100755 GIT binary patch delta 297 zcmeD9!Z-nlG#DpL)Liap;?DpEFq%OC!e<0hY(Q*L1mRmuLZu5JT!s^q1sTN!F;q>q zWK>tg5QUln*Sooq(Yb(8c=FLg9hL@`{SA}f78*0kZ&obwXJQnYTv#G5snP_s2TJLJ z+{MJfV6k~&NeLs5DMLC)ZaAWFd0~aQ&A9Qj&4SWW`E*#siZRf#i$H3xQOqgGs1#0fftNVX`2jxFCkA z$(D@jiWs6$GvInRH!?aGFbYmSTBtMmXQ4i$;%3bveNuEYXONjZu|%BF zVDrk75=I_lhMdHt)SP%jV`uZp4-|EP*kE&O*=t5doymn2`kZhFPhMGJF1P?{^#Z7O g5+;AFFlSsaS+mlfamVCLAo*bON+209`DLX#07;2E%K!iX diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne_group.co index 4a56a4a11fabdb40f0dede13ec9aaf3d1045d441..a31b28bdf8033b172b41cb89554a2ab73c8e33c5 100755 GIT binary patch delta 316 zcmdn-mGQt=#t9mX4ihz(J8to300S7!AOPVr0x32i-cbbM@0f&27eKfS9+L$b#RV}` zO}1oISHuv7ngQ3lxsh?VGGp{+TUB;nM&Ze>MLH}EEc+WK=N1_=%1>Tc^qg_c=FZ}J zM!_jfQ0t)7rOA!O;*$+ZIT&|Lwk$o*=rozLY(1|jLr!8+YEHbNv8yG+WM2it&ELwD zn3%TsPj)O8ovdFe$qDzxJFf9(00S7!AOPVr0x32i-cSVLZ4C$B7eK3StwVDp0F0!GH3 z$%3Wgl5?7%HbN;EkPDeO7&c6HEIrR?Ia#o5J&!R%PGVAOPQ0P9tNG-AWk#F-l_@bX zu9>V@sm}@bz~snEb3uj45MvaIA@qaE3oFeT6(--Tv}d%KELmmGSTQ*gNWPdnvq~KR DduT<1 diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz.co index 0c367eebd6e4a37d76a6a935d43b48824b93a8f2..95c3636f1bb0dbcd2c8ba635038f25ef49f8b106 100755 GIT binary patch delta 297 zcmX?clJUYx#t9mX0TVTsJAScb00S7!AOPVr0x32i{t*M=|ENKw3m{wuj>&?I;({2e zCR;M9D`JR3&4BCO+{nlr%_uy%HAaV}fn|Tg7)=aTyTrfEkND53|nW7E=7&SsH diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz_group.co index 244d372e99b193a6c40ee6b670f90212d0b4d4d0..5bf592a7ac2edea544c3b2f81c54097ffb3ac9d0 100755 GIT binary patch delta 318 zcmexyim~A|;{**RhNOvF%N$Fr7{CBVGYCNVj6jMFh$~_t{EAvsx&Xpu=$I_XC@zSh zYO*Dxx*~=s)C{=Z&5ew^l^MA=+o~QnXB3{y8mq(7z_Pz#vTm#~WB%mC*yoHnn?2*| z83jXfq1HjEOOqYr#3z4<=U}Xu{4@SMqtoP*3F~=H8FCVnQgh-BTrDjbCi^NFY~Grv z%*0e;HJLF^bn^RTNlv&gCJUyR3%!84=LOV{5=@hCC(BMQPmyGNF?nK&JrhIR z&g6&5`kZhNOqNVB7kmJ9$^)qP6ecI8m@__@JTt|f=|}A3TPb#o7Lz4Y?HN~0j!abt E0M4pIh5!Hn diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtna.co index c0a20d1ecafbf647c7c769cab3b429cad51be9d4..b663653c6a12c80027fbd2dcbdae23b61b18682d 100755 GIT binary patch delta 295 zcmdn+h;hdw#t9mXJ0@x_cZ_jk00S7!AOPVr0x32iPRNAt6FO1p0tlC3!DK;3aX}1K zlPww56){AiX2A7sZe+|(XB3|NHbaM{fn|TgWYJ7x#<0zvnf^?YHkD8tpww|N&CcL6 zd1JOXW5VW>*(HoTZVWkzNvS#UhK82LlOHJR0I|X5rMa&e8Dl0-%-83HyL$4;d~?AI lQ2Q@H{bVp%u)v)0!eq|^d&U=&JAtIYv@YBRWy(0tlC3#biN7aX}1K zlPww56){AiX2A7sZe+|(XB3?LHbZB!Vx~T0#OA~+#Hze&cqbrHu+!GP6M-bh^JKk|)00S7!AOPVr0x32ievk>_Kj=iI3m{wug~@`9;({2e zCR;M9D`JR3&4BCO+{n0JnNel4qpG$GqwwV1OdXa6mi-Nrr)C;6hD|=1`JC~}=8akP zjEs9G8|H{ho~eXd3Z+~?j%4CscrZCJ=RBk7WW(I`JZ=m*iAkwB@rH&@#*^jpj5f>V zDKRnLne14g&k6U#^P00S7!AOPVr0x32izL5#x-{?f83m{wujmd(H;({2e zCR;M9D`JR3&4BCO+{n0JnNex8qpG$Gqu}J+Or6OKGxZrGCSS~a&Uj|?&a8Sy$t{&o zo1oMsFwM?zWO87RIOC1UnK|bfjVD{?uIF`O$Vp5}&51WObTVX^?5kk7SuIbAiRp^l z5&DYcd$)&M1X653ydWRKU(koE~D^d-FzLE2A2H|lSA{38INu5%=c%Ktf`0E0HuzDX?BL5 z$sY^F85eBkEGl8-abw6yOiIm(H#9afp8P;j2Z#+eKP`UE$T(;6#ZrAvxT_~~mYE9* oOn{gr0Cm8J$$@3&i~^H8%j_97CZ7b70h2k)?HLbD_AFNi02@?7FaQ7m delta 292 zcmezIj`71g#t9mXA0}!pcbwwG00uCcK>)&M1X653JR={%pV5y>7eKfS8IuJW#RV}` zO}1oISHuv7ngQ3lxsh>oE~DUN-F%(NiTV1BCpJ&a_h(`(nf$O&T(YGeY6X##Ji>~ENSHQ$)=*ksOv=ZqqoKNi$8 zGX9xdSR^jV(*SV;jC27xl8J*sVe-PF^NgmG3yatDxH04;CZ*=Y8ycG!PmU`w+8kG+ z#Kb5wxv@;26Yht}JIl-kCqP{@0qUHB$&BUZj1wkXmfJI~m|O`YUrgQ!BqJvOELR5r DTIfPJ delta 315 zcmZ4SfpNhH#t9mX3npqVcNFnu00S7!AOPVr0x32imMDPmB_^QKP+^7_lLZ;Y1u;}j zwq#US#1Ms=0oS{^k#WB=r8%_ug`d5vS7h;MxM<-3+fpqztlr4 zhLM-R40Z;V$rFpj86_sKEIQ9X(dWbOd`IM zHx`Iao?j-(3HQY0gJtGI9Z=_VK;2WpKe@Y1cCvc8BxA>9$8vkd1(O?rmB H#tL-+g7QN? diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_causal_rtz.co index 6a470f5a0ea1cdf1a9391cb28f4801b428b349f5..df9336303de51effac9c188a5bff071c5f646579 100755 GIT binary patch delta 295 zcmdmRgmK3a#t9mXJ0@x_cZ@M-00S7!AOPVr0x32iPKbi=6RJ?@0tlC3!DK;3aX}1K zlPww56){AiX2A7sZe+}kU=*JGHd2SBfn|TgWYH*NMzPJFQT|MlHd#;`pww|N&CcL6 zd1H(?W5VW>F(r&VZVWkzNvS#U1}-kflkY2P1F`<*#c^*L8Dl0-Owi|qyL$4;1arX) lQ2Q@H{bVp%Fwvax!eq}xd&U=&JAtIY!GP6M-bh^JKiy800S7!AOPVr0x32ieh>xWKd3^b3m{wug~@`9;({2e zCR;M9D`JR3&4BCO+{n0JnK5OvqpG$kqwwV1C>@ptmi-Nrr$!kwicLNl^_=m_=8e(y zjEs9G8^(%Dp2>n*3ZE_Pne3RT&k6U#I!GLOrF#*vKdc)*{iRO#}lYb`KGiFS-OtNQOF}V^*N=)9F Gqz(Xo?nfp7 delta 315 zcmaEHl<~n)#t9mX2@^G!J65>R3m{wujmd(H;({2e zCR;M9D`JR3&4BCO+{n0JnK5azqpG$kqu}J+D4od*qx2aiCSQzt&Uj|?&ggnZ$t_t> zo1oMsFwM?zWO87vIOC1UnX%^?jVD{it><-N$Vp5}&51W~aWiC??5kj~Sv6jniRp^@ z6@~DOw=@AW~cd$(MPL!SeJW-O-V=`lsJ!8UTM Date: Fri, 21 Nov 2025 07:18:21 +0000 Subject: [PATCH 09/10] fix qseq >> kseq error in MI355 --- .../fmha_v3_fwd/fwd_hd128_bf16_causal.co | Bin 30504 -> 30512 bytes .../fwd_hd128_bf16_causal_group.co | Bin 30680 -> 30688 bytes .../fwd_hd192_hd128_bf16_causal.co | Bin 46760 -> 46768 bytes .../fwd_hd192_hd128_bf16_causal_group.co | Bin 46936 -> 46944 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal.co index 701c5ed6fe408be079753db64c19224fad7cb3df..dae2c6e62f1ff32079ce763400521c845aa63639 100755 GIT binary patch delta 305 zcmZ4Sj&Z{~#t9mX8zyQlcNFkp00S7!AOPVr0x32i7RZP21^Q9x0tlDk!(>55aX}1K zlPww56){AiX2A7sZe*0tWfY#gFi(fIfn|S##^h6ZMvPZC|IORX#Q0(I#zJvPhI*() zP>L7iFeVNLfz1yK;~06I8FCVnQgh-BjSUPYuTc!zJh%8YBa?v7WWxf{$@5DkIpOY} ue6ZA9umS3V2B@zZpz1g$tCvYKHcWObvu8Az+z2E!CLaWnGbS^Zs{;T50X{hZ delta 304 zcmdn+j&a31#t9mXD<*0#cjWM400S7!AOPVr0x32i=E#TeIr>rQ0tlDk$7Df9aX}1K zlPww56){AiX2A7sZe*0tWfYveFi&Ul)jWO1>zi5gH!~T&sDl~~r8a_Tc7`8l;=CY7 zGI20)Y<^i7$H?Qzkdv5{niFqm60RCy-Q_d=f}bn9NzO4gjU{HxmE= diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal_group.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal_group.co index b281ebcd087ccc5f8b456b9e836c133f9434d155..81a4dc650e6472de44b2e04420d8543e952239a0 100755 GIT binary patch delta 324 zcmccdp7FtZ#t9mX2@^G!J5KOn00S7!AOPVr0x32io{$gWPv}Rb3m{yEg2{r6;({2e zCR;M9D`JR3&4BCO+{n0HnQ`)FOV#I|jKY%@^L1DoSoSw)O!mq*V!XP!HouyYv19Yk z0)Ixqf_kWhP-@TQj|JkB3yL@xCrqv^I?re{*|T^ik26C~Vp3{OyrHp?!RB|xUQA3A zd?s%!5S{E_Cdmo+!sNm-b1sGnFds~86q|gYfDNdIbMkedItHlw)`NBOl`AoBm~2>X Q&v;;RA&@kfys=yz0N znrz9au81KDH3P19b0g#SHH<=&E!U~AHn8k(@R%I6&WiEF=BevuGcmSIHryyKIb$Ex zA}GZNau^c_!-~y`8-o~mT@4v>5|dJM;*E@4%{Cv{; q(JrwbLcf^IwN-BN@~x7L5|a;ZwP!prnQ@yvP^Tl1l$hMOO&tIlB1J*~ delta 287 zcmdn+mTARXrU@F18WS~_J1*#E00S7!AOPVr0x32iUa$_rUvLhUE`V?u5+(~WiVI?> znrz9au81KDH3P19b0g#SHH-q2E!U|`PF-it_-XUpb+efyYxY46hf>SHG&{qD$$=Zi z85eBM+!(~j>*B_clbDp66K`bXYBG6`V%%o6&94|47fcr1s?P~`?PSlb=7Iv}AqEMo hhtMA;Puyw_wD#myd&UEkIk(vZb$J3wfytfQ)B#3Zchu-%00S7!AOPVr0x32i)>seWYn(@=3m{yE8LMxn_&*Qu~JuftvwFl5L{OiIm(H!?ObW0-tS z!D#Y#Mf1&PH@{|N(&(AIQBh>_xvf&1a34+nxYb;62Gngcp#IH(s^gj*zD<&G#^l0n V_KY5rHv&nA$sd8_hRKH8)dAvnQ?LL4 delta 327 zcmaFxj_JlbrU@F1HzsN>cU0(M00S7!AOPVr0x32iR#*?=E1XBA3m{yE2a^RE#RV}` zO}1oISHuv7ngQ3lxsh?ZGGqB>OV#>LMuEvY*Qrc?y3U;O(`MQA)r^y0Y!ukMVuJ>w z;h%j_%b?UoFwM>&upd=i7363p4hDtEjGN9gnoYjAsfW+V&5a=^F)1}C-pJU%Wb(Yt z#+&DDR$^jQn0#=nJ}2B)lYeeC7n}ff-vp?i3nn{mGiRJIxpJF5qr>E#K+ Date: Fri, 21 Nov 2025 10:07:17 +0000 Subject: [PATCH 10/10] fix the MI300 error --- .../MI300/fwd_hd128_bf16_causal_rtna.co | Bin 29648 -> 29648 bytes .../MI300/fwd_hd128_bf16_causal_rtna_group.co | Bin 29824 -> 29824 bytes .../MI300/fwd_hd128_bf16_causal_rtne.co | Bin 31248 -> 31248 bytes .../MI300/fwd_hd128_bf16_causal_rtne_group.co | Bin 31424 -> 31424 bytes .../MI300/fwd_hd128_bf16_causal_rtz.co | Bin 25808 -> 25808 bytes .../MI300/fwd_hd128_bf16_causal_rtz_group.co | Bin 25984 -> 25984 bytes .../MI308/fwd_hd128_bf16_causal_rtna.co | Bin 28984 -> 28984 bytes .../MI308/fwd_hd128_bf16_causal_rtna_group.co | Bin 29160 -> 29160 bytes .../MI308/fwd_hd128_bf16_causal_rtne.co | Bin 30584 -> 30584 bytes .../MI308/fwd_hd128_bf16_causal_rtne_group.co | Bin 30760 -> 30760 bytes .../MI308/fwd_hd128_bf16_causal_rtz.co | Bin 25144 -> 25144 bytes .../MI308/fwd_hd128_bf16_causal_rtz_group.co | Bin 25320 -> 25320 bytes op_tests/cpp/mha/smoke_test_fwd_v3.sh | 4 ++-- 13 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna.co index 98714c79d528be27e301db15265460eaffe58719..49492530a7f5ce45822f1747870a994ca48f9ee0 100755 GIT binary patch delta 16 YcmcccobkeQ#tmDu88tR<%a-B*07-TSU;qFB delta 16 YcmcccobkeQ#tmDu88tU=%a-B*07-lYVE_OC diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtna_group.co index 01fe991393825683df8f7cebd8fdaa4823a2cc54..3497784a8b1c52301c88fb333ffc13bfaa26b471 100755 GIT binary patch delta 16 XcmZp8$=L9caf4|NqsC^l94`(4I)?@J delta 16 XcmZp8$=L9caf4|NqvmF_94`(4I*kSQ diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne.co index 490ad85ce4b7a4dec1e471e379e3b1ff54423236..b12af28c734eeece0a27828b2fcfab6031db51fc 100755 GIT binary patch delta 16 YcmbR6g>k|c#tmN!88tS4E0p2@07n-Gh5!Hn delta 16 YcmbR6g>k|c#tmN!88tV5E0p2@07o4MhX4Qo diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtne_group.co index a31b28bdf8033b172b41cb89554a2ab73c8e33c5..5c483e167d91110c09427dfa4eb170f991afd6d7 100755 GIT binary patch delta 16 XcmX@`mGQt=#to%Kj2fHEio7@gMvVs= delta 16 XcmX@`mGQt=#to%KjGCLvio7@gMw15{ diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_causal_rtz.co index 95c3636f1bb0dbcd2c8ba635038f25ef49f8b106..9c984699882e96946a49a3e8aaf193da3608a359 100755 GIT binary patch delta 16 Ycmca`lJUYx#tmCz7&SI;i;?0007a(<1^@s6 delta 16 Ycmca`lJUYx#tmCz7&SL