From 7d417a5957c6fdeea06fb08bbf050e3e0a1f7a20 Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 7 Nov 2025 11:29:02 +0000 Subject: [PATCH 01/21] redirect asm_moe_tkw1 call to fused_moe in order to force kernel tuning --- aiter/fused_moe.py | 36 ++++++++++++++++++++++++++++++++++-- aiter/fused_moe_bf16_asm.py | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 40b265b539..2ad5ebd6e5 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -26,6 +26,7 @@ from aiter.jit.utils.torch_guard import torch_compile_guard from aiter.utility import fp4_utils from aiter.utility.fp4_utils import moe_mxfp4_sort +from aiter.fused_moe_bf16_asm import fused_moe_stage1_tkw1 BLOCK_SIZE_M = 32 @@ -104,6 +105,8 @@ def fused_moe( num_local_tokens: Optional[torch.tensor] = None, moe_sorting_dispatch_policy=0, dtype=None, + a16=False, + per_tensor_quant_scale=None ): if not block_size_M: block_size_M = -1 @@ -125,6 +128,8 @@ def fused_moe( num_local_tokens=num_local_tokens, moe_sorting_dispatch_policy=moe_sorting_dispatch_policy, dtype=dtype, + a16=a16, + per_tensor_quant_scale=per_tensor_quant_scale ) @@ -148,6 +153,8 @@ def fused_moe_fake( num_local_tokens: Optional[torch.Tensor] = None, moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, + a16=False, + per_tensor_quant_scale=None ) -> torch.Tensor: device = topk_ids.device M, topk = topk_ids.shape @@ -178,6 +185,8 @@ def fused_moe_( num_local_tokens: Optional[torch.Tensor] = None, moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, + a16=False, + per_tensor_quant_scale=None ) -> torch.Tensor: # We do such convert since custom_op schema restriction on block_size_M, and Enum type activation = ActivationType(activation) @@ -236,7 +245,7 @@ def fused_moe_( moe_sorting_dispatch_policy, ) - if metadata.run_1stage: + if metadata.run_1stage and not doweight_stage1: assert ( doweight_stage1 == False ), "doweight_stage1 not support in fused_moe_1stage" @@ -260,6 +269,18 @@ def fused_moe_( a2_scale=a2_scale, num_local_tokens=num_local_tokens, ) + elif metadata.run_1stage and doweight_stage1: + return metadata.stage1( + hidden_states, + w1, w2, + topk_weight, topk_ids, + w1_scale, w2_scale, + a1_scale, a2_scale, + a16, + per_tensor_quant_scale, + expert_mask, + activation + ) else: return fused_moe_2stages( hidden_states, @@ -611,7 +632,7 @@ def FinalFunc(): logger.info( f"[fused_moe] using {'1stage' if run_1stage else '2stage'} {'default' if cfg is None else tag} for {keys} " ) - if run_1stage: + if run_1stage and not doweight_stage1: return MOEMetadata( functools.partial( fused_moe_1stage, @@ -624,6 +645,17 @@ def FinalFunc(): ksplit, run_1stage, ) + if run_1stage and doweight_stage1: + return MOEMetadata( + functools.partial( + fused_moe_stage1_tkw1, + kernelName=kernelName1 + ), + None, + block_m, + ksplit, + run_1stage, + ) if ( "ck2stages" in kernelName1 or (q_type == QuantType.per_1x128 and doweight_stage1) diff --git a/aiter/fused_moe_bf16_asm.py b/aiter/fused_moe_bf16_asm.py index 81df5ea592..2167a9e091 100755 --- a/aiter/fused_moe_bf16_asm.py +++ b/aiter/fused_moe_bf16_asm.py @@ -8,6 +8,7 @@ from aiter import logger from aiter import pertoken_quant, get_hip_quant from aiter import ActivationType, QuantType, dtypes +from aiter.fused_moe import fused_moe BLOCK_SIZE_M = 32 @@ -263,8 +264,8 @@ def asm_moe( # fc2_smooth_scale) return moe_buf - -def asm_moe_tkw1( +# TODO: move into fused_moe.py when module is deleted +def fused_moe_stage1_tkw1( hidden_states, w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K w2, # [expert(local_expert:EP), dim, inter_dim] @@ -279,6 +280,7 @@ def asm_moe_tkw1( per_tensor_quant_scale=None, expert_mask=None, activation=ActivationType.Silu, + kernelName: str = "" ): E, model_dim, inter_dim = w2.shape global_E = E @@ -410,7 +412,7 @@ def asm_moe_tkw1( a8_scale, fc1_scale, fc2_scale, - "", + kernelName, fc2_smooth_scale, activation, ) @@ -418,6 +420,34 @@ def asm_moe_tkw1( return moe_buf +def asm_moe_tkw1( + hidden_states, + w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K + w2, # [expert(local_expert:EP), dim, inter_dim] + topk_weight, + topk_ids, + # following for int8 quant + fc1_scale=None, # [expert(local_expert:EP), inter_dim, 1] + fc2_scale=None, # [expert(local_expert:EP), model_dim, 1] + fc1_smooth_scale=None, # [expert(local_expert:EP), 1, model_dim] + fc2_smooth_scale=None, # [expert(local_expert:EP), 1, inter_dim] + a16=False, + per_tensor_quant_scale=None, + expert_mask=None, + activation=ActivationType.Silu, +): + return fused_moe( + hidden_states, w1, w2, topk_weight, topk_ids, + expert_mask=expert_mask, + activation=activation, + doweight_stage1=True, + w1_scale=fc1_scale, w2_scale=fc2_scale, + a1_scale=fc1_smooth_scale, a2_scale=fc2_smooth_scale, + a16=a16, + per_tensor_quant_scale=per_tensor_quant_scale + ) + + def get_block_size(token, topk, expert): token_per_expert = token * topk / expert support_list = [32, 64, 128] From 81dad6c049f59dc41db68c577bcac90b24742b9b Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 7 Nov 2025 14:27:20 +0000 Subject: [PATCH 02/21] add required keys to fused_moe_1stage_dict --- aiter/fused_moe.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 2ad5ebd6e5..4917f137c3 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -26,7 +26,7 @@ from aiter.jit.utils.torch_guard import torch_compile_guard from aiter.utility import fp4_utils from aiter.utility.fp4_utils import moe_mxfp4_sort -from aiter.fused_moe_bf16_asm import fused_moe_stage1_tkw1 +#from aiter.fused_moe_bf16_asm import fused_moe_stage1_tkw1 BLOCK_SIZE_M = 32 @@ -153,8 +153,8 @@ def fused_moe_fake( num_local_tokens: Optional[torch.Tensor] = None, moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, - a16=False, - per_tensor_quant_scale=None + a16: bool = False, + per_tensor_quant_scale: torch.Tensor = None ) -> torch.Tensor: device = topk_ids.device M, topk = topk_ids.shape @@ -185,8 +185,8 @@ def fused_moe_( num_local_tokens: Optional[torch.Tensor] = None, moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, - a16=False, - per_tensor_quant_scale=None + a16: bool = False, + per_tensor_quant_scale: torch.Tensor = None ) -> torch.Tensor: # We do such convert since custom_op schema restriction on block_size_M, and Enum type activation = ActivationType(activation) @@ -465,6 +465,10 @@ def get_block_size_M(token, topk, expert, inter_dim): { (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True) : aiter.fmoe_g1u1, (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_fp8_blockscale_g1u1, + (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, + #(ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_fp8_g1u1_a16, + (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0_a16, + (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1_tkw1, } } # fmt: on @@ -612,6 +616,19 @@ def FinalFunc(): run_1stage = token > 32 elif q_type != QuantType.per_1x32: run_1stage = token < 256 + elif ( + doweight_stage1 + and ( + activation, + q_type, + dtype, + q_dtype_a, + q_dtype_w, + use_g1u1, + ) + in fused_moe_1stage_dict[get_gfx()] + ): + run_1stage = True block_m = ( BLOCK_SIZE_M if run_1stage @@ -645,10 +662,10 @@ def FinalFunc(): ksplit, run_1stage, ) - if run_1stage and doweight_stage1: + elif run_1stage and doweight_stage1: return MOEMetadata( functools.partial( - fused_moe_stage1_tkw1, + aiter.fused_moe_bf16_asm.fused_moe_stage1_tkw1, kernelName=kernelName1 ), None, From cac3dc21af249b6f6d6c33d82070a99f17194af5 Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 7 Nov 2025 15:21:43 +0000 Subject: [PATCH 03/21] add kernel descriptors and code object files --- .../fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv | 1 + ...oe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co | Bin 0 -> 24032 bytes .../fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv | 1 + ...oe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64.co | Bin 0 -> 23776 bytes 4 files changed, 2 insertions(+) create mode 100755 hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co create mode 100755 hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64.co diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv index 9aebb8aec4..cce9c4aa95 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv @@ -13,3 +13,4 @@ _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128E,fmoe_bf16_pertokenI _ZN5aiter51fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320.co,0,0,0,1,1,32,320 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512.co,0,0,0,1,0,32,512 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192.co,0,0,0,1,0,32,192 +_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co,0,0,0,1,0,32,64 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co new file mode 100755 index 0000000000000000000000000000000000000000..74cfb603152492d72ae86322897954ff4befc7ec GIT binary patch literal 24032 zcmeHPdvH_NnLn0fS?Eae16wk05$ps%c^V86K?WO4z+mGyj|kb8EeG2&w#=gm=317` z%RJ%`LX$M1JQ6|@LK8yB%U05~-I;|Jnr%yV(rspTJG+0hvzh63AKq^6e&0FwDp!gq ziNx-7rWSMkyWe-d-*?YF_tQD|>W8$iWZf!NT%27)_)ASbCvnm~f?e9*wu$d-h(})` z(ZOF6F%T`dk!$D8U&DZuOh%m)hi>_pk}DLZNT!TQ+(sUBfe_iB!~I0U+YfSEMxx+h zUvJk)?K?xUlbdAr1ZhX&slfauJee<_-^Sm1oACU;>G9+-uK*q%@7uocSZqA8!}tor z3l{E{Q6BpQaG!uSD_~Js@#^wwLVTB;?M?1(Xa06qSBI;u`-!3R`OdDE?$1d1;H7+L zqqoP|=5RJPy1KhBcDbBQj$K`?UKg}9HFbf{(cS56biLTp=4o)YozL%Zw!6N+;iY^} zb91-LOU~zaw|>W^dnJFTv#rPF*xuUF^coL!H1sq#ySfHAZ5aJ;*A^7?@Ytb5+es&cs= zuOE+ayu0`ER0P&N?&X=v+F&P7M>yQwJ9!}j>mGOVY-Pz;LwPyE@$TNsYY|v??4`Vp zHnp|~2PeQ2Xyi2$SKqrPP%OmPKaMrQ-^7~WdK3xHxs`)(Gl~q#yOHaE7i)ssu_pK| ziUj8yoZd!vuoOOzB1On3{8Ow6{w3A~zl$QlIagA^2GAQQ88*!;IXGo7Z!_+7;oGDfe zgEfi-q2-VfbB5V5XUL5r!;iet`D z8gqtq(PVHb-xuZ4WC)oas$$Mi7juR!QDis|1yP}RYix@mM`%Sf#hk$%bB4AkGMsZ2 zD+i)8iWH#(;f*=N?wB(?9z}-pE_ixXl!OpPj?j_t$DH9n%o&EF$Z*c<>D>O+&D8NI zQiKk~iI_8-iaEn`-zLL5`7K?Zo=!)%tEJu5;dL~(?shdf@I&68qLOfxeTobp9|r@R zZLKXGnzz2@N6L-3)7SFbox2^)ZBDOam#1sHl$^B@{?YKKiyfQR7dYWF=dOhdn%g}t zM?>?xMUGBam)Eo1)v>0-Tj*$+*E7%I-M(v{qs7(M<5)0%_o9U*@8x%PweEC!UBSM5 z)md^Szst1)KFba{=96%L92fUWes@b}m!r|s)8XCrQhxWI_6ASer%R*kMgI0Cc>WG1 zyrCkm=kIdvbTxN*VEAwp?D7ZjJ6&Dft)7lySsmb?;Vb`QONtsk#g}yusc{KlGAI{; z$=qR={1`89tjUUsMLF{=Dm4^TUd_Cv1`lrdl4S&LaW0AS#G~cGfXRoCfT3e!mvoy5 zfn+&2m#YoL4<{$(vHUZ;pxw(Rnm*+c~%JcjZFbM|T zoCY-lSOYj0@Fbue@Rxw)fMzq)2w)9hCm{CKN2)S5iW^}ZM6waCV>(%QD|-(im0~6H zRf2DlN0X` zYux>CJ;kazgAOK3w0$+23{?QGTZwHrzI55}GPsVm^mIX(G|886ocsO6o?)}w-2sS6 zC!dW>BEKG)O#WeH3i-`QNfE?s*&+yAxB67j4=7aufqAcY-iqs^qH0|)WEDl#dc=LAs=Qa+ z2X6*4G%6Yl_v8COG?^Is42BG!$&^8o%0Jf{69mKr6Gwfbj@dLxC;0LDNautx`Yfp} zo1-pS&tfN}AXa>yPU4}49vAR>pgl>p;r!5^EZZ<&qdi5oVVu#PD%8R6xU^uvDr|K}GfnDxp_U zsQ`U|8bCjw7H|+y2Y3Kb4|oXB05}9_0zBG_^>e05Yv46iW5`L+y77H<0J>$3X^19px>0>|S4jwgtQwV$$c zd`{qaI>7NqLvNmiQ>RcUGy8;_)?1++DjxL-RB{nE99u_MZB zg*J2u<8?E6ygn1+aXeFb9zz!QoAP)bc1@9|yryWwHT7eE=1nvqstP8W*!4(%T0mQ~ zg!{G2z%O5WN3D}TGZLNodl39F|Hsh+CC>Zx9; zr>%;5y2E>4fIjP-Qa!oC>Zw(#r*=g>VIE~Yp)J?b4ym4c!s=;{R8M;q^@KcSJ)tev zQ=e2%17Y=aP^zaRih4qxvYw#6Lh7kPs;BC(da9S|X)E~odph`DXZ7Ti>d6&WPpwit zwL=|pUT{vp>z}kks;8c?dfFq^(_ZlNIU+cBuzKo~>S-XXo(@X&bOh><^MZ4XvVNke zr#PsmUw!Et{g9j+`H-9+5#rt-*|nf`aX57-WOl5?@!pQ&nAC+EBjfE zCI>JFFc&ZvFb^;fFdO1?AU+r3^H_ZRt!${y91?#k2QU{f7cdVn4=}r^syC;orZ=~! zt~ak}Q!mb!_i6ftY!Y`X`?qnxi@TKrm}pt;&UNBkHvrc^T{xL z`}6Tv>*zqOo(|R<=>E+n>OTnfVX%*YeauAv67KEso-Lkg1W`4+_k+cQ1)?gOVXFF* zTY|b$gXl}B)T9CWX)VLS#99s9=Vyosa9<7g0{HAkik`Whoywo9@IC|Yd-~zIjuidw zb~Yi~1da`41+$w3j?H8xv)uy6M@ccW+XRjsq=ea>0>>`0irHR)<4&@g*}Daf-yx;U zeq7+lp5^fzw*6=$>$~$OHSC<4ZgAW6Gj6+o!R^*xal7r;+;0C3w>y8!?H&KjZSU{7 z-Se;9-u*{z@A(gIKmI@5-YbG#GP^geB)eBsRl$9C=gRD?+4}69Y<()sPpL3BrNX?F z3UiV`&iq_ZtBGJg^K)N&TkPx3dMf87%|s`|7&kee0m~T&Lud zHEdi3fuoOKBI4 zKS8ZWR2dh5A8OK|5srhcfp|o1{7M#oUuwT_0{sw=s5MraP66s54pAT9$l_+D_Y2RV zAL0=8#%|MTKm)`fn&SIeT*lOX;T-xQ4$)+M(xhC^3WdUf^}_mr4Z?I&mIg^@a&7$VaOlp4M0Xk-#Nof`2yKfey)qok9l-2;={{W@6kUuMn zT5@uzB`=R!=FFjEv$J@7b`Fov&ExU;b6C82-aKkvuz;Ei3aEMUVmdZ=9;J1VZ~g*G z>j4)QP`U|l(PBzBL;fX8sO7;2sb$$RYFWOVjukH9@k<}%@wR0={-Nb8-dt2f%_~<@ zb4dv`uU<{Z>_wDrfqW}gQo0qexP;P209UQ1)B*X|tf7{*YpJEIj9NBqpkt+Lc>Kd_ zdHlLE9>0DAi=RtMZyrcilT0GsI#8ii!8~&-bHhgj{Rby<|3UEQNd5!l_qTGYf6m$u zrSbMdVZ6f@?mw*H9aQj!Ch_*6Fy4_V+1_fTf*C)3*&w7 zLGJ⁡X(-J->{%KOe?>VLA7|px`~J;GHhw?We9?!-r3dM ze^$YJO2Ip~hPR&!%dF!&z@JOMZac=D??hg7uPhDSF$X<%4n<@O3Iw^!P^U2Nxe z$u@4U+Q#kGq?_?diJ##bE4LrEa(k_v+w1JyF5AZK_1n0;0Y3{)jr($h2&KO+x9dm- z>@&L$>>_TPrS^?06gISPT*>WpslBRLVMBXW3AZOp?KP_uHni8Q=Jr&ny{=SYLwnsC zZcmrmH$AMdp?%X@Zd;}HE$b9Ev~MZn_I*#1e)W@_2Gm0BDQI#yTD z<2P;Q@msd?_(vQp-t2Tzb7LblyIj=V(n80!IjI5H?uJHcz_q){MGd%iH@8p&uHCJz z)Utg$wY0ZWi^oI9+^szR(d|6Gt)0hrcv!r7#|~=l?xyCR9%|mTi;i{fpaxvKySk|X z*KThQHQ?I4a~Cz>+P!BFwLJD1wd~zXE&KM-vE6%k{C6JX@sIE2@xA+4yt%KBn)~~y zd0>E=_wT1;LLW8Y+U@J723))S1JrP{HSx3t@B4%az>!vSQA;pqQ&JR`d3YVZ2vrx&IXf?_~w=)q39k zY8dae&D{T*f_G8DyR?K<{(czmgU7i40|oC51#e_8ZyyQcjqc|5Bs_QLj~`Og7?t?Z~rKa_rv|%|3d}uEd}r6gS`FYFy4<2bN`PN zysHY{kB{*7ABXXNa*X?bqTszP@z~mZL0Y>nN^AG4(%OAVTDvbxYxf({+I>Y@yRVL~ z-2~RoztpuGKTnOkm6{Ri{hN%z4R)P_Pn zuLu2yjS*;tc4$W(2od$4NV*T|LD-J^k5czR9SHrX|Agy4=tn)M7!dUzrS5||5c*O7 ziKP3W9)#_v|0s1I)Pc~C`cJs-gMQS5l=n!{d|n?({|Tul(18RvmIm~nU_GHeqSO)5 zmUKid=tCHf`cFtbfewUz1L!}&dO}_V-3Q}Lpbz;03Fto|_0%KP6X-v|dcr(+(0#O^ z4`Ci1=szL#1UeA<4WR!7>k0FO={^`^0&OUyo=2g<`cEX?#|(N9wx@&s z6H-s01EGH^=s)4Q5BjHr9#jm7`cFtbfewWJRM3AS={{!AgRngv^q-J=0v!nbQ$ha; z*L~1G9rU1B>uFs7!TafP{byYN`TzgFUGnq#xcE`VYpT?j!3!chY|_4t1Z?uzsNqbPjM_{~6bR@H44#{fB>MFs}c6 zM8@@>(wpP@57$Tj+Vr2!|BkHxu(ca>A6W;I^&j~@Qr3ZF{fDjHp!>)=(4F)jwswQ= zBkMqS(tp_64Z2SNpFKeb`WSFr{~6bRaP1z~fB4!xuK#>Q#`T}lo8$V=U+3E0_g}L9 zga7xkZxp{8g|7>fI8qKIT|)dA9A68i??>yoNxTyN@k`NzyI)9r_`f>rl`6$cW)Rl3 zcEGQtF15mMs9M*pE4I#`n-Bl2v*)yVI$B&^j!vi7jlYEI==M0gZuo9{S7&>-qtV&n z=i<7oDDIh<{64o|nEv&+-m+UDw>)7{lL zr>$*gdmeZmbv1gKVQpSGw{UUuqK1X@3!CS<7Bv?vE^swC8w;I<&c*W=HaQ#S&0o+o zcabYA*zK?Fpj-jp?jtvwLz@he|Fd7!fcZT}w!Ho1h`6s|1>3i}dzr*8ok^H7T9q@I3lHbzN zlkawRyNUD{{oXE;-{om?dYvTS>)P#Q(a!ePMsT~@x;$`6o0{N)lKe(bJN`BT!JiKB zIvW7pz~*}#Nt?65)#jL6vTC8cTex_!ygP4IaCgD{xw3cRk^*_RaB+~fq$n7$VqQVC z-?@*Lhu_tA6xXjWDqF)ZUFc=_))v+yN=+b}@t3?Kq5ONsa^MQ<| zQXtL;cr2Y{%kxAc*#9ny9+@xC2Qp?z@p8M&m)r!W56msM%kzhfRw+*E9PlN-FXi9G#_y%$OP-JA@q@1dv)4oL7kx6nbQkxRO7Ht?GJ(6lcK&7H z$1cs4+GUXWugeYzIz_@9Mr!?>^~=w6A3SYE4p-Q%CqiOFkz_VjsaN_P5vZolVK;D{`8V+vO!OcthssoX{$41o~EKZpB?z#GfBtsqhHaI80~ zM7%o^JGe<^8^w;`X~6t8JcX}3zsdfcpf*?0`iZir19E`~+Tgw>5gZ-1$2^o$a31u15#Y<-0qZyFMdj1Fz=08~okw zR+qb>!PC`sq0{4TbnWhJ@p~ZB*w_g^S67F-!Sj4`tFPYOdM>}+-R602(+l~&rlu~B zpPb9@YWa@G@KXLRcWbxDwWFoI@fDuxs_$-U@^tod%9###qe3#oAf2z@>tZF8O8&ha zo--|r7Df;8MK(lVd+XkB9AT|;u+$g_yX6S8Vvuh42-hehw8uEuEk`&z2I+Q>uu2)> z{1^wj

L7kZ$(~iNV;t<3BP@+Uy3HdzU0bdUu`I^XZaKu-7^K@h#5Lg|ZjW)a zTMp41gLJ!xSR5W=M~tK0a)^6kkZ$u3&s1y(kMW8W<9N3ms3#}wKyuM??$ftBGCdj5-spq90ktR zyZsH`a4UQsM~R48_}4@W{ClDW{t!ojv!0Gle|QqUh@(QpB>ZQh1^$v~fxpI4;B;-5 zG6h$YNE`*!b8t0Dmv99`!WB&MR9I7`nupYQDyXiFSCcXlu8@^*g(-1VI9;ro274R@ zBHJN5;R>@8t}r)_3a2-zreS^@1tOOr+Jq}?jibUjXozyvOJjQ+H6lBrG2sf{ge$bhQQ@qoSUnLP zag>Oh2!FyA_9R^4;W#Rs^T4}TRZB>5)QFskV8Rs+BwS%IjtXb}zK$JV-Ao;iqeSFH zoJhFBlL=RN`fsW5PJVNzue-z5G>n#Xd!bkB`F%?$(y( zcHNs_^CRVI(y7b&ZSFm;rdGG#wcFRZLlkE*!ao}Rbct)r#sW8d=G?h>QB#}Gju@HCS>f zztgi5KFf|c=3{VwoRsuZephoxr>nu&-R|H1LVnlYwt8Rdr_16TMgERPc>f(PcvVAQ z%irzZLyT72z8iaNkQ!&m>sNV*n2#aDC?5nKwG2FgWf5_dR- zAM=$BL+CdWJM;aR9_#58c6yk=m}K(2z#)wn zaZHpaGBnF(hBRIAhBM?D3}?x+7^cZ-41vEN)iU5S1U^IHGXy?E;4=h1L*O%< z?)>xuT9>T}!Felj3?-K?A6gFQ(UFxUNfRdqQjT+fkT|m)PNz2nIa%bh z;fdro!;{Fr3{NJ%9WE(?oUL0WY1_7d2F3xcN|M)#Iwq4Ona#Y;YUn4jGTX6or8flo zj+Mn;NAYT}V|A%_HS*YN@2A7a-zdTBb@^pU=|RWr*^-o#6X4G!H9M4=?)|3cm&=}4 zMz)+`^bG$hgo2DA1Af6xboG{0jBVxH6 z4QWP}J1!lu{PQf53_bL)gy#e8iHZ%+5A8{c4cBY5Co4A0Gul%W8~@`2*i#i7|Dyod z(-b>}+tb087CkI2&B&0X%*+5u-TJwm3`iscPzRU=XaJl7Xc~}~iQbenWS^vwdL)eo zFaW3n3*;&?VCpc~n)4~}%eImUVt`O_yPO>05aWNIyJK2o2M(AG`le*IMRi*p5YN7UyE zZ5R;d8)oo)<1EO>`OM&TOgY?d$>Vj{IYplOoT3fqG>GF_Fu{VTDVShk=OgP032og{ z?$<8|zjE%8r#|;+!@0+NLlMt6mOwt9&ndi)X)X6#Ht;&EN06uP5$MSdG3QRGm71nC zBc^9+%}URpoiUO7ty8sT$8#A7UPAS z96y%J9PAuA%8esSrKSC9dzrMXUwN%4uZ1n*`b(0^-YzTGjXX}G`l(#>Q&m(y)ro%E zrs}7gy!Is+v%xL;$rII2Eux>=RQ-f?)cu6E(oZ`@KXpg-(_Ya}kEr?ydFp;bTj{4> z(NFzR{d7?D(-BoaAy3^;&|eY#R4)3dDypCAL_ci|=VfajcEyGj& zbqr7Uo0F1g8f<2*mZrgGPEMw2u$fa*Xc}yzB#DkMOOi)5S~|WQP?t={ZwAz-(D4;X zvSr*jnhu+VQko8%#cZbOuvx~Br|Gbb8pd(Hk#fGt%=wn_jBoChjA^i$`vS%^*v$Pw zV;XGc{R74{*hVGEn1ja;1dKU&{9w?SgU25jFy`R#4<0b4!)7^r$e0eB<;b8h9X89c zqsDaDMh_g|{6mL0e{hiVj~-=w)_btl%^-bJE=j&FnTdT4U@zcYzyM$#U>{&^QDx7Z zqUxTxMYTP7MO%9CdUuBoc}drEf1dA6zk43@bLu2c>2Y72E=&w-()=5roVkW!72mouQAer zIuqT0zl8=5f_)h5BVZr1(7%U!YrJPlraDR1%>Bfc8A2VldNX8U*fontYP*ZiQ{)jDYGAzII{QB_#C$X zXfqqT`zO`xIW=D8w&!Qu_WqLFEx+b=>uLa5FZ$?B^f1ip>BRpKitRl!+W7;Va|h< zwRU{pgY(}D^UvQah;s$^QMuxM6=mgBt*QzdCPt-#8%NSE>1AEt^+K;us(s zm>rZj4it*_LYu^U34HI3_w_qB@%z3Gzyq)@kR+`UQIkBA-TTc*?UM%4KTc~z)R-56 zA9~WHla7O}gM37N@+y{pM@FA?0{xJWs5e(wo&+>N9-=Y1f#uE2>XV*EKja}A&0UsL zfF{U8v?TYjyzD7`(pmIF9-_tkm_@yw6iS8t8>Nl?o1{(s(a+JEmPV};CQxf;CbimZ zbTl=MQoN?7PoNa9sTpwg@S1A1Q3}^o+r){~HhD6&O`S?@)2GwXtcg5-(qx`LWh&2~ zHl5{LXUw41nKP+%)+}nh>n=KKpFwFQ)VpIQrB#60vnZ_wyz?$fYoLBk4zYe50EE?GiH=P#hN7V0frL}?x1;sQ#y z0N%ZX()*$Q(xudP&pp((d^xqPSV2b%m-75&_wanja-M(h3YKpzDx%g^tEjc4gj(0E zp`*?sO1DD2m8&S-23TA|=>veP*HG$$`fJxx+q!kswqXOcZQ4XfOV{%J`_}RN^&5Ep z#!W1LJ}JF+AWchV5&8Ooa=iwwGuLNr`iP+a-~{eJ2>v<3e}Me)`kbntv-qJ*9zPVt zJ8a|r!z$iE6>o4Nj}J!ij!fqMBP!k@74PU&9zPnzJ2svBkEwWvRlMUfc>H)2@6nmu z|EP*LsNxOH;_;y<-eY%h|6?lN5f$%54v(LR;ypfx`yW^Fj;eSk^LYGZ6z_?7-2a4% zcTB~5asiJ&8O8hVBJTgLig#Sad#ZrPpNisrZwdE*PsMvw#d~@wk3Sv7`~E%L|9us2 zNX2_*IgdXR#d~%I_dl!RJ*MKFD&p}|QM~6?asP8F-U$`&bP122j^dqJ!~JJeyvJ3% zvuk<$Y!vU@I_^KG;+<6Sp5MUZ&qwiI*u?!WsCZ8Z9w~q7h890i=mI+a=q%y?Q&h z*N`s8DW{ zSRUS^zmI;%L!4y(lO+T=1@aK5C2Pmyy*%ENe~f;}L!4%wF#cPw-R0%fT2Vo*RaMkl zQ$t6~%Bcz0?#&g{gll(Y6*b}7U0p*>xOUgoQQQ6ZQ`@#})aG*0(b_tmzvX_OzjYhW zf564^t!_89HZ)MH$3v~n&2)6To0@R#u5X|wT)P`R)P!qyQ!_Q;+TGGZZ98^QTU#5o z`FwQL+rsl7+`;o(+jxGvkL6o;?xfbPE^6)Wrq1fAJYQnX^+d~gg+ar%q+rE8tbkAO%|DA_;{=<*({GNR*-`d+tt$lse+TTyD`}fmP zsh65??GE%&6RzFCerm$Cdtg5`;o5!hAhjJnOl?PwP}{L%bo9VMo`2{t&mTO(^N${5 z`M5@(FXz{m^HJBF7c02`Mb$Ouyy{wYp^C>ZMDbp#;r^FYycboxm+N@^x_rI;;y{6(_-pS*aqj>LhasN9i-bEGf z-EJO#H;VV(Ztj0i#ruJZ_x@fUe?N-%!9(2tfr|IKiZ}cSj}J%jM)q<4h>G`yiWlnT z@lX`+!#?i+P{q5X;(gT5;~z!wez>3ef2iWUsp5TnkjFoc;{E6__y0)6drQUp@ev;X zaTM<-$GHC|D&E_I$JXxiV(q>l*6x?Z+Wo3nyDy5h`*pE)UlMEgTVrcCfwlARb?wH_ zGiUrY0D6nWwr%>r*Yp>rDG1#Kbr=cOPt;$&jqZZ_4CbT$qSjqdhe1E;FIeZBbQjD+ zeI^Kq`iok3K^+GDsK0z0-39d-%t!r2t-GKOgMQRsu+GhN7tBFzCgSsQ&|labfmY{) zIO;HnsK3P0T~Lp~IO;EI-34_R^rQX~t-GKf^_XHn)L+!P3+gcFNBt$1?t*#@#!-J! z>n^Cnpda;@Xx#<dHhJeK|v(NCbmNN_A2=r7@ZLVZH5&lW0GU z=`VOcJ*K~)9yF%EjOj0^*No{ek^0T;)L(ENLfu8tVHEvE(OncBM$um|A9WW+hq;;l zf_bRBC_2o|^cT!S-Q^UlAE?8e1su~~#`G8bOlVAh;qTeU^p}swnEq0FZA^dR`ouq) z{&M_(W9u($?FQXN(P0$*MbTXp9Y)b#*xC)ci=xBaOn+f(H|Q>k4s$d8g{|G7yM*w) zALuY21CHr0WBLoO-DCO-U%SWjmygJp{!)5vOn>>uT)Ut9A4Pw`|NGcGf?o~7$N7mI zDF>1z5I=^-$71pQU?Vq)U*Mm=lzq7Sh2X>gm0^zzkuS_JtZ!+DUkhDkhu;vjuU}tm zUpPM>{@G{GYxT7^dpcbmZoe0Q3Dni)bNRjS-R{ngwk}tLyWQ2*-O=Ie^g~9&4tH}) zdo#p6u0~INceAUdy~*cl@^!l0t*tI!m#d@G*VNML>6+Ko*)XrQbyr&+cpmgL_?cmE zT0FmSNz>i+ix(C)&G+2hRIsGLQ}1plbQijpEL_~^u3xZlQRDo(JvrfFe{Bcl0{C_t zx#o!snS_7q`ba1%ZR-6Y@d+2d!~?zWZ&aC=%ieQ-(}8{veK{03he{x$%?ug?43^?+Vr^F5BF)m`st zbG5enhDUWGntqb_6QF=c|O3*QsJ77hn4dO}QQ@SSk|ndVt3=2wS;Mj0gL_ zWZ9?imFt0m*&<(wD}3Q5c=^EEN?f^qC}uLCR7R2vQhks{U;bF4sQ2_hM#pi$z<#nn2 zE;N2G8z0K`SeZZgDlB{43xCn4@Jn}bf2nxgUmnM8Xq#9N#VyK~RL>k@$N5JODmr>}q~iEw_~Cuf&w&Dz+DPV(||M{x6k8fzkZ` E2V(Bb%7 literal 0 HcmV?d00001 From 75fa00815b34249f5cfbbbd50c874126c59392f5 Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 7 Nov 2025 15:42:25 +0000 Subject: [PATCH 04/21] add 32x128 file descriptors and code objects for tuning --- ...e_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co | Bin 0 -> 27752 bytes ...e_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co | Bin 0 -> 27240 bytes ...oe_stage1_bf16_pertokenFp8_doweight_g1u1.csv | 2 ++ 3 files changed, 2 insertions(+) create mode 100755 hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co create mode 100755 hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co diff --git a/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co b/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co new file mode 100755 index 0000000000000000000000000000000000000000..fd64a2219642ace4ebb4047c008a44a22d6a2929 GIT binary patch literal 27752 zcmeHQd300PnZK4~+18chUADo1j12+DmNynd1d+i8ld#wrgAF)B*zyv*;U$44m}hyx z5VM(mrz}aDq$$lxHlwC#o0&oynwd78(@t|vr)T=d%$Yf-vwCK#`M!JabFLJ%1erg2 zPVJn>Pv7tRzWctneD7Vo_lv!^Wc3P7SeQdc_>Y!+LBhmm1c&(iyGH(DT{v=uBoh8L z6C=?Bjkyl){t9}tozjkw|%1>ozy{ zZyWH(c5af(2Ju1UX~6tCd1}78ed~YQRfpU6b=Rlv^K!_;_1*6eu8ZwYEEsYPt^GtJIan1X??<54g%J6`RJ3yc}e8_b&3qAXyK%$g344U+&5qLDqNgDsKkK`lD5<_tCnh zmhqkAT^UBfkC7L!)23+{xG5C)LM@S7} zge%3;Zm@5JtGNPTCE*Aq?QZ8|F=Tgu)4rU=Lx0Yi_@`aD*^~ z|Bfi0@Cc<79$|GTBe>A}&|c08TuY;ZSsI}7r6<>eQa zyk~3gXnM-ka0={Up>x z*l|#g|BruF4)NZ`KZaY60z9fUl784I^*v3PFKp1~1BXWMg@57YlHSXP(CfWcFcqWT zn~S_w?{y$X=)Gl#k$UeoM4jH-ju@r)Du`6?J%VV^doKehVBbJHM&tDYH~>EdQ~Sqx)+u?!)9AHFSv ze1?$E5b_y9K10Z72>A>lpW&m9UwozzlB{Sh?#2D4y=)(`=j(2T#r5A($fu(Ya(7hQ z51h@WC^}=1;1U}Bs0ApaMTN2?M0o(|f3wRRH0r)e(GC+$3&JnMae3b^TltM*kht{S)*f@3wVw2zbtThUzUsQ`QZxcz+32Cz4b+b{K0 zg1vFvezj*K*j3~9YdxF5-aKx<)l&ub)^YpYo@%h47`I1zB7F7{D_kTLd%oh!FHjUK zA^oJv>Z#hC!H?fo>)_U{8MZB(8~fmR`l_}XBjH>}+p}4hp@Dn__PM>pJ`!HKWMl~( zOM7y%qNJpFB96c@1-_p+GVBgVqZdBg9m^V%$uCD!$gf7HkiQ?DN`5_BQUv_gt%_1p zseP%k#A;QPv^3Asj0{DYHOmwJION8}C`xRs$4A~7<+Z*$ zitp`x9c3%w~qL5whSG(<--o5n;c{dkPTHerk*Q}`a{7#V9|u~Bh|o@pO(!1X{m$CVsQ>D8nRc>C_Rcs1Ly(N0rmsx z0S5si0fzt$fCm7LfWv@hz(asBfJXr107n250FMJ&08auY1D*z)0(cfM74SUZbij)} zc#dAF(i{1CyV;l>p>M>V9fHla>-CKuy*{It#Y7r=$>!PFQIU-i9ynH!k&T|n$c$al z!*Cls0LveKp&{gmgX3|9<7qF)^F+t|mmM6hDI8z)a(pGiqr0^{vj0{)oI^OCLVo`l zNzi#q*5SoHy6- zJa!IZ9-MLSihU&8a)IgNR6P7mYyiVDn#aU}(8`_c+eDQG9G;=E-I@anmbd2sHdKe?Ryr&NL; zwlB@j^HQrhKfQ+Mu^xe*Y!~N*19GitS`*^TSz43YGiYbb<^1eytx2)6bK?f`bMiQ! zIUnLhZy}$xi1W53T9cQ@zflalwJ6?%II|?)r1l`%8LK!ydkyeuZ2X(Z&nf48W+m{V zSCP-E=De*Y-sIr%@0S9fo0e`u%%7Fc>c^f&d){2mFUU?eDH&{hFY<+XoL@X2;_=u( z?;~$t#QCL5(oJ3-|IupTbBhX0i1{T2tbV*kpgnID=NGI2eij@5W8@3VIls6Pcsv$x zEkWL1&H1G@1ttfNzwIltvvb>CZn(8bSu`Nqia|S07Ht;yND@}|c3HXZ)^ABb zKe|Ley0z?lwW|HtB>J%hdH-v(0<}f%6#du@ez_lai+!L_c2OdGbAB2-X+l68-2-XWKX4k4>Tvjq0cx%EAAZS;HO`lu52{^XN^?>>a8eI|QGe z<6tpQol4_iF;AOD<6tq*m_g%U8B0##`BSFy{M2bYfBFoTZ|YSHv9Or>Jcd|UOauLf zSXfN^1`V;Wj46sC6Sv>vF=XQQ_xBqzar*}c4Vk$8`-cp1u$Z4aV2Fdod~nzh2aEae zAwwK2V?#qc|G)vBKRnFy4;^CprlUs;v9OqqjTmBKF`YPWh=s*;>ZBnSma!v844Jt7 zBO``P-2UUo4Vk$8Cr=tOar@7lHpIbVK6lm-2aEZ_c|#m5=1Uh1aj=Y?KF#ybp5^)H z&-46?7g>H6-m9UHW|BTd39B~rYPT3X;WdW72&W;7{P=FxIh`&WFdNVYXamdv%mK`T z_-u%`L3|F255JoQSC4EGem5J?251A!0n7o+Dyr(qF528tgS@c2*?=}c8(Rc#ocTMUDy8pFQmudzK4pAGRgh|gj1_rE=}jD5;l@tyL1x2wN8k`8P! z(7_rb-RCsZ{{3J-2lhd*51Z-uv5M(ahGMG?`{TD54kXqXo=f@~+XnI35O0I{92S57 z+m@bi|Ho>mI)?7w5=RfzB+%!a7CLkk>|R5({8Z6uiQ>5O zQ%{7JB*MKk5$>aja1Tv{`)4os`>B@nf`2etOA_E-nE?061h_{g!2Qt!{-Jm+@qqt8 z66C|ZFcI#9iEs~0g!^9~_=l&$6Ke1ungRJGSx}d)X8^v_55P0HOK{BktF~J4c^e+T zUO0aI*_1fe@SQqKJkKMmrhJ~Iz=MHuz+S*LfF8g~z`iff1AL}1D4y@h{T~II97!$SJj2LGUwdV3#V^S*2 zSZmz03K$D|*{D;n)`X~9R=dR{YM#eyPDtrAW6hRnyyoPnK34k_lc>3n*PNVk%#1Zp znZ|2Qjk?NepKcO0+xrx(F*W6NGvf4VubbsLYoSs&uvS?+uufSw5O^#sX=&7wo=z>Z zW>L#S579Ae8l^aInwd^1&YLo3QHt}X*$+_)^QNS^b7|7U57VUVY?_plL&xUKSFEIT3yi;N6-|2lahkMd4NY3Nj*gYC;_;6?&f{0F;qhzNv3N^)Iki+& zP)lVcwQSl%$I8kn-3sHaub{LBa6=`fPXKP*M5zu~$$~!6LU0lNb7X$Jx zJ<9n@Qr;;k?{X3MUk=Fo=5o$|Q_4FnzG$U4NYO z*QLC(Qr`1xxc~WpycgDS{sk%ToRs%sIrqO9koT<$&VNhFJ1^zERLT7>1?0WFiSsW@ zc^9O-SE{-Hm4Li&Z{hs6rM!z$-m5j-|7t+qcbuI6j+A#v^p5qJ*m z0z3}b2Y3?rCjn0b4g;PAz60<);Bml>_Skgn#`q$p-)W<=jpd{td;F4gL)!+@2!*8&^m+_&2WPcB=4Kl}a}Ft5$J) zy6|s)OtQhh`EhPrg@5a6$p-({HQb&l{7k(&$f1lm!@w@XI$iqDzac1IY=9_>S5QjKB{5LEPzw5q_Jj5Z+PW-*u3pfYj z5HrKI(du{CkC2Br#LUF_=#K%jAP&(Mp3dUr@0z#K5Ale$#DZv?V_=QBQQtQs=9@&V z@ZAw>%uV^sjF>;|Gd{+83Ba&G4NMYn_+!do$vKX}spbC~Y*>yx1ga zhVO7#Yhg-!G~(iEyk>h;I;(xDNz@GAxv)liNa#2fdEw#Aa)Y8yE z$F{ks5$71&YpD_E7r4j-mtL;HFBf#-Pq@IfAb=rD`796d@c$Bt3Ui4)Xv z>J%M2a+DfzjxlnK8gY(s`~)@P9OL9EYQ#ClnKLx$+&P+b;Q~#%bcv3gKEvbBp5yW7 zFYx$_msmW`C2qL*HSk8jweh>Pod2$LZM-2}Q(tp)|7!txuQzc1bt&(=Qr;U)-2X;E z-kUo(|E84pnw0le3-`YjkoR^Q=iip{UYGK2?&SWP0eSCqa{e7D?+q#M-EQuGHz4o5 zr#b(gl=r5T_x^6~e?K7agJ(GZft2@_lsCGE`$q%vZtdm#Eh+D9DbL%>{oa7Q@AYy1 zds5y_DeuDp?*A|#@B8~W|9vU%9Vzdl{oMaiK;93YSJ||0^l)eJSs+PjLTV2juN{7|L@03oL@mVlJ6C*XFEkLS zjX*zYA>?A1!^Er8-ae*Ldfr2Q4@hY zY9TOB)<7VS+K3@Q)I#9)sqb7-6M;NxAux~WbtDZ0Y9r8(S_t`_D{3N;M=b>A$r=dc zQ5!J~h*}8teots1P!oYXY9TO>>3<{*1ZpGDk6H-%ohxc0kVh>9=BXM8mv1_Eu=M63oxErk5e6*UpaqZUHeK%gc9dDKF%V<9vUsEI%xwGgTX0&Ubp$aBn4 z-n|CXLiqI~Xd)C?mkzWLcpk|5H01T5jX*>#gkL{`CIWdQXdy69)<7U{25m$?AZj7} z`Vlk{$m>80fqAmt4S79iBM>7&3*py~CqzGj76S8R4FtxUK^rjyh*}80egsVf@(G}Y zz&u$$hr9)}5s0XT@asp=L?E9ES_sUOH4w;82W`YKAZj7}`Vlk{$R~gn0`p`&9`Y8@ zMj$4G7Q(L|K@)*|Drg}vPt`ymHyv~le*Fkq2)}*=O@ssDGC&I; zB1!>K3*py~pou^pwGgrfVgYC(kVh?qUq6B-0(m=VAyf?n+Dkzbu^JGy5Ptm#nh50Q zffhp6Kr8@F1oDNTh4AY~&_p0_2Q7rEfk1mHXd))skCR#m_#QW@g_zVrOllz}wGfk9 z2=@QpD4MVS|0k1L2zZ`9sfGB9)k5I>5H%1-VBUqAh~t34v=A7F8VJ-GpeEuxU@$EN z#-Rq{GWbywaSbq-76RjB4aAEOhuVl&0RNa4g4GHd2&@$~5m+l~Ayf?n+Ng=Z8c_?u zY6cAi+Ng=ZnuBQ}Sk0h;z?xALfi(xyLa>@a1A(=oCIV}n)Iv;ZA@IG&q!xldm!8x@ zd`Kp>5T&0^Y9Y8*;4e)J@$>)wsuqIXw?PAeIs?>1pspa87J}WcK?8w01Jp#Io*!)IzA=6VOIY1lEXJ z2v##_Akaol1lAl(3&Cmz4FuMVnh2~pm==Q73>pZm6*Uo9>!cQ9QVW4|j7cp7pJPmF zAwDFNT8Pq5C$$iNnRATM|5UXQs8{KQh5zjk#2YF2FZ_Ba4kTG1{u&Sbv5Vi&W#A@x zO5nf$!PkZ#|4Zb9#+Lo0ig;m;!|JA1_^Ta@t?;)ztgBZSTk~^m@Sk-~PIFspgS*4o z?&@m9zsTY2Y;$%s!tYq?Xm9Cs*1B4qo!#y2Z5>?@QM<#{(A3%hez&vEy}i4^+08|Z!hP8fPZsEfE1>5K47uM&x z7u3&RIN!b9Ra@vPbS=!ESLfQEmtRnqyTCnnyzH-h067T!M34>L{w|}+Z}8{&pErph zr3L={OcCF_*q?|0p8$R$$g|7+`9i^$28_?cBFL^S{_$%?{7{`g{|TC!NTU9o8P!`2 zAAgD6!SHwczjQBlsKax?I}<8zGy2;z*iZ+uhC1y!=J;#mC&D z`Qsn+^YT@;pm>@3aY5b+^<&Ze@qp!dB`RC8AV1_^`4(~_{;oJ@arN4wHLC)Sv3e}- z2l0HwU#X)}lhh9a)xWEwy45~aZOp~qSJTw%fr_Of5U&Tgu1I04*NJGb|HJ2NQ}fm9 zfr=R-UiGW_f+l$Rz_C@odi_w*Dgsr%TEB`jz^|3^)$5gt>h($t!?r>FjQ9{NxLxY_ zUUB_W(E&dB#~~Z=IR_TZ$6s^AejXMm{tndm-+K~tU;*=i$6VY7{89T-{rgPqoaTR9 z3O=j*FGlc71*hhJI=~&Dx;f^m7j6-cwaPzhz(6inCgHQ1|D6C{%~x?F_|<(Uhbr~l zvWh?ufDkN2tzSKVo&Zj*ooDcWFE?;hw_o+C>s9SW_z)~V1|9J8a}MMG2Il`i8hkY~ literal 0 HcmV?d00001 diff --git a/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co b/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co new file mode 100755 index 0000000000000000000000000000000000000000..978419b0c9f64bde8cc9ef9cce39c7ae64644f77 GIT binary patch literal 27240 zcmeHQd300PnZK57S=N>01zWPg5E&Z+jxD^f7$S%aHkgFP#u#jt2w}_E!5dx@Xo7i` z7Ys3**>}p4q)D2Pooq%~+NLdJX=d7FPCCswou26*?U|m_>9ohwiRSz6z0X`JN@-;N z=sC4=9>4efzWclHd&_Ahd%UB$~5Ya$|?k%WJ=R(GF>KnSu(02_jT6j>o8j5(rWGdpJ)7ZYbfgibT?3t(&TO zep7cKw(yW#Fp3$W(?Iz5=v2MBeye`dUW4oR_byM}=4GJ6<$c=^E{knXEEqpQ_^g%F zDyrK)8t7w?vjG+^FIrMsL5TM`cVkUmo7=wC)7s=|Xgf4;#qMt1()I-@?R(Ddu5Rye zH@Mu@)t#@~NB@ldVRu7^$F;S-spk8^A z9W9VUb#)GsMDfTVY8E(H*YvVr7|HVrht_-L}Na5TZL>;*yaPH-%W* zZA+{RA-dBg7LAnH5@Kn$EpdAY(H$=FQu*?c8gD2emUr7CZ}f*y-SH|{m#W9(je{YU zcl#<&gizh#Dle9=9Vzlch{fH$$frW6?r@Qp%8PGp%Bvxkcl#=@g;0HCmFji0roM4x zlp(^4-Xde;I8DpN+PIFT&{H z%4T7;Oi?9c>4{3!{U}o|e}3k&W<&FnS2s2>(6W4*oLQ4*nWO z2Nx^b)JPFo| zLl`|=_7uq*q9u$j0yjkasC(Ex>K+~oqlYUV`1UFt5=t0-1a66*QTNb4>K+Ed=;3mE zbIaCSH&X}0=pt}K92s>FCq~`FnQzm>>-H_J%^fYSHqVwuPgA?Awtl;(#)S`gKg2f) zud=7e@c4M7fxDr8OH<@4U*l17GvdNEd!u{1tG2=2?s}xTb*t!{>wnp!iLD zOKbhZ?sm^eF}JEEFWXx^+u&h#z(Nnh4RS=pGxoMEEv>HV=8mTJO;6g}b~J8oZutD( zaLcf7t$}agBR$^Kkmv1>xF7b^wl>2C#O>fzAHhHDX>F@-ZW>hI5Adh>U;h{L30ioX zuf7ivP38f{!y63WcuqJ4kL}e7Q}DVVg2dSvS2?fdg;}Ow>$Je*gzePV2{3!>6Ylzx zaD}kpZY^ERVRg4YPVwI67ISOvKA62vYa~6eQtEq}2!BMM-wz3m&IkX(%O#zUb)nPw zY!E6&oi7J@tSKY2pnplEX( zPy_sFfd>RWBk;VyO9G!1c$i_5KZ(_m;!j~{@|ze=_fKay(La%4nm>)<6#o>4@&0&* zpx=#e%RtW%^bA4I5cCW|&k*zsLC^3$=jWd)gd{7PbGvZ6X)ibi9eI)0yW)GUE9B!L zC;4PZ+XI~4t|&8Rc-Kbo?TPJr9FD;Qq;jgavNFR_Sy`uO)^!r4^ouN#rSm~7=A*G~ zw8Nobdq-B5qGV@#k0tVcehwH18{Luw#|YpWz#PD1fKI@l1C|0>EO3kft^sTT#9FJATSq1wsnaunoYX>3a{&L2Z1D7JW)&A9tTMgWJ!BzMx7*_$@1i`KGuVLI8 z;8F#**1wiXG>A&I*XvjKt4%u7S96B!01TEyU|a;#WH>A>J?&ztLF*@q;7rP-nE?F=&I6gyP6k z{CW9`Vk4x7RNB0i>oWNM+hFV4upz^~eqCKR>`#B?2BRL1b&S31A~Q6gS74hviXDSd zCG!X8!@hJRCo4)yiZ}Wo>{H--h%>|Cbk_M`-r-zWmrTAGN+G`-8c+UxXaf1wP;nvf z8#XA)#*JR=4^LWAOGRHsqoSBhysyRBS5bzesHn~d^Nym$b&ka)b&itz>lUL9&I^9> z;Se11WJMQ@+xgcN^;mRFo2Dq!r+fK2#bx;766*fedSmTI#Srwk|!e%*4bx zhs9E7&>A{N)*bk$`?22nO4PCig9{uBl=(KBqNJsH7i458%9JVIs0TnB7pExk@m@c9 zeTdik#t^=@_mhuU&3^JGJAce0{|4ujmxnZ3U9_LPB66>SCKB4d2B?YjlfI#QpT2gV zpWGZWP9A(akNkBA&Vik|3vXQaYgTkZuYSL#6mgedv#Qg-3w9D2xb8O^r{b9Ko6QWp zMq`H8Y|bEYrC;hy(F$U;nWG_A&te*mNCmdNvI$vmHy+NaDD$FQOyyA+Mmsc1BS zUcgAe9zY#nAD|wvAJ71}7tjbe0B8o>4;Tk{5HJC75HJz&2%rV<7+^Bs3Bd7yrvOs{ z&j3yWJlBcG=*3E%ksr6~jM>q;Ivlh8u-XotuFk8|WpuGNdSe$^H#Iv(Ul;9#eWlme zdG-2??Xd%J8QcrgSD$MLIq2kgMB#YC$MFn_WcdqDj+YgVPx&}L8|{s}uGRNkZ-HY7 z`%}mtKP72eGoogS*37oI?YM$**8C++a%X#yB zu4Bgx>Y(r*TwdWgGoq$=oSE&V|f^p(>&RepP7so2<;8?|caxTx0p9OhX zzqFZLmpYg8ljd_BJElJ2tK& zKP{K@nX{n17+c6^&E>p(zSiX9?O!ee-c~ryggCi)oJk#n7-uZy{M6;Zr?LL8AwR8* z^O+UEi?ND)Ru$*%8^@WPy!~4xz~`i;n-KG+q_g^QOk+HAI_GC+r<;@v*1r$=f?Uqe zn+5H0-@w>M-Z7W+3+AVre7ybp%Ye@*%r_zC73Z`1@f?Bi%%z;4y&U){tp5*?FDT>u zyb9oPU%vNU4y>dKHnb)hH%hYqxI&qC85v8w{ zmPKCw4G9{LZZRG`T6VnJ)bUs^#$zM$f#+rgYSV8M*Q-fKP)fKEyklKovq)m;qc~ml%(|`GMncj~I{pfLF&OuAe#{4~g-3 z6!P+TJSoQGS*}yBSpnnGEyklKXgt=7@z@AFKWD?a9LCEwF&;aBm&fA{F&=j!vg7jz zjw4%_7>~U$zQi>V`HVecJnkE3Qm+ljPdg;W<56fYkH?c@Jf0nAQm-L6j&j^$JbGY! ziEAwKGwa28Yy_TPQ!XH1uuY7|4&deSxI>J`o#`g^nuOyhr%Q~-UKn3c|HydUBgW&t ze3N?ZV&n0U7>`GT#^Xsb9?#~R)N7nP{=ylL5ilO-eC56V4!Jz^4!JU#byXp|^_R#a>fH1dWHuq}9@Rm`qVoG#(~X zbTo~JX_!RN*wP69utrN`%K#&zXzVILT{Mj?kMNsgVrT+PW=d%SOlFgbCctEljim`N z4eMjL-axtDXySTvEYq9f<7qrhrg7tFJWQsfBpMHs$!ewXFb&7W)7aIpyo7NywgNCQ ziN>x0v{-5ET3CKc3Qd5?JYfP&fXO^@B29qFeAitx0jA;P6s{jXf$LKza{Z*cnBLT- z7~)|vb$bo*FqwLL4Dm3TcJ~?LVH#ExLnf}j*K5ec_3!C1Wa9ew^%*j8{rB`65@0ev zzSocdlX>5OAps`yf&GRAn1=iNxqk0nt{)iS`u+Qv-gM}oAs!~v;Xy+@Os1np4Dm3T zjvq6`!!&&Gpdl02e{j%{iR*vlh#?c#|JX4@Ca(X<6NUts%%@Ko5@0f)J!43K$$b8t zApxf06DPR-)G4k%bB62Bon!iTyjH_FnoPQtEE4sJVj{L|z%D>LpcilkU^ie^VP$7_ z;kr(H;fBr`g%5V3EjugRU5K#h@#Guv4ibv}?VF zdN)SWZkLWKT@ZIe+zat;9mRe#^t_%PQBSX^r%%+g`y1++V(eB{i0^dYwyvHkJ?&j@ zpnV&Sbhpb)d-g#5IK=xPK47LJ+bWLN)-P(?BWl|#YJ2<}YD-W2_S>qzDvs`1pFsC+ zOr(#yEVTa+#D^h13h{9Z-&V=Io&izMK2gtpQO|*IsAo#*x36cQDw*zEKc4R2m`V@0 zCeeYD5TAzlEX3z0(SL+z4*2~sibg7a&9u(9=k(3;Ycd%oYCru%(UwOddZWuDlK^|@ zT84eGYZ>1^j_3DUd46C5&+or$Z6rLO$nZzQa{&0hfX|#r;i=EE68ZBW{LX^kp}OID z6e;}OXIX@7QaElV3t3#FaI7VZSX`%Ye25gWxIy9AM2cD5qHt^_i&@;RaD137Vexi_ z<9A32iyu=svS*OEj+=h4imjXbM{C%+)!gK<=OZ51{VR{_f5GF1U-G!|S3GX{4Uf0| zCy(2I&*P5&;_>!B@p#9ddHmS_@pz{n;^JwYNyV9+eoZAjBWPKaHGP^P>z+(QB3x?{ z;X0ED*O)}Oz9{74&lIheD9(#N^G0h)5?o7?;5w27*N`N*esn>;hiXX|J}Bgn^V4Tzega`OqmNN9(si>hYr(lff1Gsr_b zL|syw`2wIG+94XEx>>u7Y2C_Y` z+z$0(jd3xH%!mmlQELvbH7=#xjI}0AEQh{eEE^*gtTi#Fn$>PGiJE8fniEsn%viH! zBCk0)rkmA1-Xv-+;58?w95!Rk<0tZ(Q)4c%+9#Pr&5mvbYfMdf$&5H@;!9@vo~=MB z=v|?#=v}F->Ze(Dcm<$ zr%$KWd+wpu>}+bCF@p|Io6g&3-ox8xW%Ks-8LYh}H^YReeSvlETxz}dUTU2`pIYy`j}8~ih+2e6XR8%1CH19jND^fh3ldXuJwqMquyiJdi?JAGV*YZN1UAWsredU2DC$*8ue?| z4!!FoSO7Uvk!0@v_s5{(#ERa{oY3&+975pjf?#NFbmot+N080JNdib4a`G( zM0-+xEZ$>ajX5#znGy3$qE`6shBfA-d}>C_oA@ce$AIq(SnJG~U$fe0n?%j{FBx;85U|4HG%D7m>c@uffj+k^-`vQ}w8NPF2jgFN3Si}Vr z^JD++dknXmTB@t5#p9utEnDdDCO0+WJ;vs0YQ%et8V@z%Jx1*oYQ%et`g&^Jx|LcR z8>zLqnGV<0^Y#yI?ti7eHi(0z7sin7< zT6XWI!%7!5;ys49n;P*RqoD+i#I;Xzq z;rSPXbT4h;{7aJV1xfdEJqFKTRV9Etsvdok8=KPN%yLx8`{b9LqWRh zyEuPc(!D0>d|f>63(~#Q&G~mE-8D(~ZZFTj8>D-0H|O7zbgxUg_xJGp`$4)NJkI$a zNV+#9-4FNi{11b4KRUqqA4$45CEW*yc>aSR-H#7*{>PH;ElKy2qdfnUAl*-obN;82 z?rlkT<0Q}D2-4j=&H0;>Zb;I7c$Vis4AOmcp7S3`y6Zy6?lG>4dyE&wJ;ux89^+MU zk8w@hW4s~mG2RmQ81PLQukGwz_1Hbe&_CKe2EN<4`M+M+KMJ4p-miH9@ZLau=ZKyF z1+Jgy1sHkfi0`b02LQbRm`5*w{LT?Q0m!2l0Clnl0D1HV^Z=q4K=uHjCjfc$0-!F~ z1AyKD%%c}Te&>ju0OZjNfI8U&fINBw`T@}kfa|Beb3{)7^5_LX9rLS59su+PU>>~y z@;gWL1R#%I0MyAI0OZjdFaU^N0QP=McmU87fINBuP{;f)k_Q020hmWGfc(x8Jpst0 z7XWpt2LL(r1q8f%1TO&Fhv1uWLLNNH)wQJpuAPW;pL2LwNxL#v^zF6j)XycmeP@ko{T6>%bd;h+cqz@d%y(Iy^)PolwU_2fb;}N_7sFOVa*xn4@ zfPO&q0tAdl@B|>A2wnix$^J3qE#M76L@z+Vcmz)X@~PkjK%MLXKz~bK+29F49=!ko;}JXo$UDFbpn3o> zUI3neWq{}f2pEsx2|#`(cmZS&z-;gYAYTApfPnD`o&e+>-~~`U02nU-Przu$Ju0mi%l;7u6w0*rY9@c+F~G++P!55~Lz@H~6W3-C|t1;G0w^Z*=$`wsL3903gF z1;BRb0l@d{=m|Ij7|IKP?a%{o0rKbxxC|J|3xMrp55QB<4!r@-0)CShfYk~f0IU@~ z0az<~0aOnF#^?#a8qo{DY6cGg#^?#annQU3Sk2%8z?#t$fHjBm0(yl@_#ljfOYumUI2F81`hzfXGc!}zE=zsU>0Y6TB~ z>IuMF(F>q{Prw*G0azn?0a(r80l*kN0a$Y=F953M3)$S%&TSrSvb89=asNU+{ zQs1-%@*Y=>XLH9ESAA1$v#Ykb)#Ywza5cBNT3VZH>l-|6Gum3KXEZcC+?Wl_L!Rn( z7T9WM<`m4SoxOQxUO{b+XLjwZIkP;Q-PHx|0{5J}nKkasxq10DIkP>}N6P-%4A>#y z6HQjP1*VKDUmnQwKW`CDO7a8w$)bJ3yg(lO^zex$k1Y)33j|*h)c+#EZ(kSKeud!s zHwW?`VyMxRP4NHM$e$_*Lj6t`SdT;gP8YE^HMe_+y=F%f{OTC7Z)xhV*SXv3i1-)z z_EutVZLV>*yNSKsv%Q_Qb~n~n1MO*OZH8T1Qv*Ac*sGfx@i*EC{(@P%doy4isQDa6 z(%|0gX>jG{<<1hbIfb`o^K$c4Hos`0Iy!FtqZY@$DpSt~Dwc?bcs{^o=|!xb zCt@M~AHTm@)vMZ&sQp{=PS_- zTL*O>@ot!Kz104F;{2te6LRv$DF-k=118kt&)l%jJp#qw0UG&xKY|V}U>5MG#dW}s zIxaU=^SFfr|Jy<^uWr9M!7mV;s{go~Cq5SX9DZUFt$D2@0n99i3G2_1Ntjpl&j<0? z7C$Pkfq8Y?v4r?Xso^s=(GUy}1g{}#{p#`aAaH8!T*3dn;NhsQznW8*tHyOO6G|Uf N%Uk?HZ748U|Nld1yjB1J literal 0 HcmV?d00001 diff --git a/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv b/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv index 80e783813e..e717c51eaa 100644 --- a/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv +++ b/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv @@ -53,3 +53,5 @@ tile_m,tile_n,knl_name,co_name 144,64,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf3E,fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf3.co 160,64,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf2E,fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf2.co 160,64,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf3E,fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf3.co +32,64,_ZN5aiter44fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co +32,64,_ZN5aiter44fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co From e5e57875294522ff8d08ff72395c9fb37a43d15a Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 7 Nov 2025 16:17:45 +0000 Subject: [PATCH 05/21] move code objects and kernel descriptors to correct csv --- hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv | 1 + hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv | 1 - hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv | 1 + hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv | 1 - 4 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv index 1e62666e2c..ae9c6e7abc 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv @@ -13,3 +13,4 @@ _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256E,fmoe_bf16_pertoke _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512.co,0,1,0,1,0,32,512 _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384.co,0,1,0,1,0,32,384 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448.co,0,1,0,1,1,32,448 +_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co,0,0,0,1,0,32,64 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv index cce9c4aa95..9aebb8aec4 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv @@ -13,4 +13,3 @@ _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128E,fmoe_bf16_pertokenI _ZN5aiter51fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320.co,0,0,0,1,1,32,320 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512.co,0,0,0,1,0,32,512 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192.co,0,0,0,1,0,32,192 -_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co,0,0,0,1,0,32,64 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv index 6059a431bf..0c46448894 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv @@ -13,3 +13,4 @@ _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x448E,fmoe_bf16_pertoke _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x128E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x128.co,0,1,0,1,0,32,128 _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192.co,0,1,0,1,0,32,192 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x512E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x512.co,0,1,0,1,1,32,512 +_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64.co,0,0,0,1,0,32,64 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv index 96000cda17..8e1f534655 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv @@ -13,4 +13,3 @@ _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x192E,fmoe_bf16_pertokenI _ZN5aiter51fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512.co,0,0,0,1,1,32,512 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x256E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x256.co,0,0,0,1,0,32,256 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384.co,0,0,0,1,0,32,384 -_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64.co,0,0,0,1,0,32,64 From 5e37fc93bbeec1622aeb98a253d5c92dafbf5f05 Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Tue, 11 Nov 2025 12:30:11 +0000 Subject: [PATCH 06/21] remove unnecessary import, add quant type argument --- aiter/fused_moe.py | 11 +++-------- aiter/fused_moe_bf16_asm.py | 1 + .../fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv | 2 -- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 4917f137c3..d24546e0ea 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -26,7 +26,6 @@ from aiter.jit.utils.torch_guard import torch_compile_guard from aiter.utility import fp4_utils from aiter.utility.fp4_utils import moe_mxfp4_sort -#from aiter.fused_moe_bf16_asm import fused_moe_stage1_tkw1 BLOCK_SIZE_M = 32 @@ -246,9 +245,6 @@ def fused_moe_( ) if metadata.run_1stage and not doweight_stage1: - assert ( - doweight_stage1 == False - ), "doweight_stage1 not support in fused_moe_1stage" return metadata.stage1( hidden_states, w1, @@ -465,10 +461,9 @@ def get_block_size_M(token, topk, expert, inter_dim): { (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True) : aiter.fmoe_g1u1, (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_fp8_blockscale_g1u1, - (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, - #(ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_fp8_g1u1_a16, - (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0_a16, - (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1_tkw1, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0_a16, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1_tkw1, } } # fmt: on diff --git a/aiter/fused_moe_bf16_asm.py b/aiter/fused_moe_bf16_asm.py index 2167a9e091..26bec95dd9 100755 --- a/aiter/fused_moe_bf16_asm.py +++ b/aiter/fused_moe_bf16_asm.py @@ -440,6 +440,7 @@ def asm_moe_tkw1( hidden_states, w1, w2, topk_weight, topk_ids, expert_mask=expert_mask, activation=activation, + quant_type=QuantType.per_Token, doweight_stage1=True, w1_scale=fc1_scale, w2_scale=fc2_scale, a1_scale=fc1_smooth_scale, a2_scale=fc2_smooth_scale, diff --git a/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv b/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv index e717c51eaa..80e783813e 100644 --- a/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv +++ b/hsa/gfx950/fmoe_2stages/fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv @@ -53,5 +53,3 @@ tile_m,tile_n,knl_name,co_name 144,64,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf3E,fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf3.co 160,64,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf2E,fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf2.co 160,64,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf3E,fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf3.co -32,64,_ZN5aiter44fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co -32,64,_ZN5aiter44fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co From 2d0c5a1f471b4117bd57d2e6e8dd7725bcba0dab Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Tue, 11 Nov 2025 14:40:54 +0000 Subject: [PATCH 07/21] move fused_moe_stage1_tkw1 into fused_moe.py --- aiter/fused_moe.py | 165 +++++++++++++++++++++++++++++++++++- aiter/fused_moe_bf16_asm.py | 155 --------------------------------- 2 files changed, 163 insertions(+), 157 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index d24546e0ea..1dfcfcee09 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -269,7 +269,12 @@ def fused_moe_( return metadata.stage1( hidden_states, w1, w2, - topk_weight, topk_ids, + topk_ids, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + moe_buf, w1_scale, w2_scale, a1_scale, a2_scale, a16, @@ -422,6 +427,156 @@ def fused_moe_1stage( return moe_buf +def fused_moe_stage1_tkw1( + hidden_states, + w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K + w2, # [expert(local_expert:EP), dim, inter_dim] + topk_ids, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + moe_buf, + # following for int8 quant + fc1_scale=None, # [expert(local_expert:EP), inter_dim, 1] + fc2_scale=None, # [expert(local_expert:EP), model_dim, 1] + fc1_smooth_scale=None, # [expert(local_expert:EP), 1, model_dim] + fc2_smooth_scale=None, # [expert(local_expert:EP), 1, inter_dim] + a16=False, + per_tensor_quant_scale=None, + expert_mask=None, + activation=ActivationType.Silu, + kernelName: str = "" +): + E, model_dim, inter_dim = w2.shape + M, topk = topk_ids.shape + device = topk_ids.device + lastdim_mul = 8 if w1.dtype in {dtypes.i32, torch.uint32} else 1 + + if fc1_scale is None: + # pure bf16 + aiter.fmoe( + moe_buf, + hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + ) + elif a16: + # a16w8 smooth quant fmoe + if w1.dtype == dtypes.fp8 and inter_dim * 2 == w1.shape[1]: + aiter.fmoe_fp8_g1u1_a16( + moe_buf, + hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + fc1_scale, + fc2_scale, + fc1_smooth_scale, + fc2_smooth_scale, + ) + elif w1.dtype == dtypes.i8 and inter_dim == w1.shape[1]: + aiter.fmoe_int8_g1u0_a16( + moe_buf, + hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + fc1_scale, + fc2_scale, + fc1_smooth_scale, + fc2_smooth_scale, + ) + else: + raise ValueError(f"Invalid args: {w1.dtype} {w1.shape=} {w2.shape=}") + + else: + # a8w8 fmoe, opt: smooth quant + a8_type = ( + w1.dtype + if w1.dtype != dtypes.i32 and w1.dtype != torch.uint32 + else dtypes.fp8 + ) + if fc1_smooth_scale is not None: + a8 = torch.empty((topk * M, model_dim), dtype=a8_type, device=device) + a8_scale = torch.empty((topk * M), dtype=dtypes.fp32, device=device) + + # moe_smoothquant_fwd need topk_ids which contains local_expert_id + if expert_mask is not None: + local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32) + local_expert_hash[local_expert_hash > 0] -= 1 + topk_ids = local_expert_hash[topk_ids] + + aiter.moe_smoothquant_fwd( + a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale + ) + else: + if ( + w1.dtype == dtypes.fp8 + or w1.dtype == dtypes.i32 + and w1.dtype == torch.uint32 + ): + a8 = torch.empty((M, model_dim), dtype=a8_type, device=device) + a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) + if per_tensor_quant_scale is None: + aiter.dynamic_per_token_scaled_quant(a8, hidden_states, a8_scale) + else: + aiter.static_per_tensor_quant( + a8, hidden_states, per_tensor_quant_scale + ) + a8_scale.fill_(per_tensor_quant_scale) + elif w1.dtype == dtypes.i8: + a8 = torch.empty((M, model_dim), dtype=w1.dtype, device=device) + a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) + fc1_smooth_scale = torch.ones( + model_dim, dtype=dtypes.fp32, device=device + ) + aiter.smoothquant_fwd(a8, hidden_states, fc1_smooth_scale, a8_scale) + else: + logger.warning("FMOE fall into pure torch quant...") + a8, a8_scale = aiter.pertoken_quant(hidden_states, quant_dtype=w1.dtype) + if w2.shape[2] * 2 * lastdim_mul == w1.shape[1]: + fmoe_func = aiter.fmoe_g1u1_tkw1 + + else: + raise ValueError( + f"Invalid MoE weight: {w1.shape=} {w2.shape=} {lastdim_mul}" + ) + + fmoe_func( + moe_buf, + a8, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + a8_scale, + fc1_scale, + fc2_scale, + kernelName, + fc2_smooth_scale, + activation, + ) + # fc2_smooth_scale) + return moe_buf + + @functools.lru_cache(maxsize=1024) def get_block_size_M(token, topk, expert, inter_dim): cu_num = get_cu_num() @@ -645,6 +800,9 @@ def FinalFunc(): f"[fused_moe] using {'1stage' if run_1stage else '2stage'} {'default' if cfg is None else tag} for {keys} " ) if run_1stage and not doweight_stage1: + logger.info( + f"[get_2stage_cfgs] run_1stage" + ) return MOEMetadata( functools.partial( fused_moe_1stage, @@ -658,9 +816,12 @@ def FinalFunc(): run_1stage, ) elif run_1stage and doweight_stage1: + logger.info( + f"[get_2stage_cfgs] run_1stage and doweight_stage1" + ) return MOEMetadata( functools.partial( - aiter.fused_moe_bf16_asm.fused_moe_stage1_tkw1, + fused_moe_stage1_tkw1, kernelName=kernelName1 ), None, diff --git a/aiter/fused_moe_bf16_asm.py b/aiter/fused_moe_bf16_asm.py index 26bec95dd9..7baeb105f9 100755 --- a/aiter/fused_moe_bf16_asm.py +++ b/aiter/fused_moe_bf16_asm.py @@ -264,161 +264,6 @@ def asm_moe( # fc2_smooth_scale) return moe_buf -# TODO: move into fused_moe.py when module is deleted -def fused_moe_stage1_tkw1( - hidden_states, - w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K - w2, # [expert(local_expert:EP), dim, inter_dim] - topk_weight, - topk_ids, - # following for int8 quant - fc1_scale=None, # [expert(local_expert:EP), inter_dim, 1] - fc2_scale=None, # [expert(local_expert:EP), model_dim, 1] - fc1_smooth_scale=None, # [expert(local_expert:EP), 1, model_dim] - fc2_smooth_scale=None, # [expert(local_expert:EP), 1, inter_dim] - a16=False, - per_tensor_quant_scale=None, - expert_mask=None, - activation=ActivationType.Silu, - kernelName: str = "" -): - E, model_dim, inter_dim = w2.shape - global_E = E - if expert_mask is not None: - global_E = expert_mask.numel() - M, topk = topk_ids.shape - dtype = hidden_states.dtype - device = topk_ids.device - lastdim_mul = 8 if w1.dtype in {dtypes.i32, torch.uint32} else 1 - sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, moe_buf = ( - moe_sorting_ck( - topk_ids, topk_weight, global_E, model_dim, dtype, BLOCK_SIZE_M, expert_mask - ) - ) - - if fc1_scale is None: - # pure bf16 - aiter.fmoe( - moe_buf, - hidden_states, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - ) - elif a16: - # a16w8 smooth quant fmoe - if w1.dtype == dtypes.fp8 and inter_dim * 2 == w1.shape[1]: - aiter.fmoe_fp8_g1u1_a16( - moe_buf, - hidden_states, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - fc1_scale, - fc2_scale, - fc1_smooth_scale, - fc2_smooth_scale, - ) - elif w1.dtype == dtypes.i8 and inter_dim == w1.shape[1]: - aiter.fmoe_int8_g1u0_a16( - moe_buf, - hidden_states, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - fc1_scale, - fc2_scale, - fc1_smooth_scale, - fc2_smooth_scale, - ) - else: - raise ValueError(f"Invalid args: {w1.dtype} {w1.shape=} {w2.shape=}") - - else: - # a8w8 fmoe, opt: smooth quant - a8_type = ( - w1.dtype - if w1.dtype != dtypes.i32 and w1.dtype != torch.uint32 - else dtypes.fp8 - ) - if fc1_smooth_scale is not None: - a8 = torch.empty((topk * M, model_dim), dtype=a8_type, device=device) - a8_scale = torch.empty((topk * M), dtype=dtypes.fp32, device=device) - - # moe_smoothquant_fwd need topk_ids which contains local_expert_id - if expert_mask is not None: - local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32) - local_expert_hash[local_expert_hash > 0] -= 1 - topk_ids = local_expert_hash[topk_ids] - - aiter.moe_smoothquant_fwd( - a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale - ) - else: - if ( - w1.dtype == dtypes.fp8 - or w1.dtype == dtypes.i32 - and w1.dtype == torch.uint32 - ): - a8 = torch.empty((M, model_dim), dtype=a8_type, device=device) - a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) - if per_tensor_quant_scale is None: - aiter.dynamic_per_token_scaled_quant(a8, hidden_states, a8_scale) - else: - aiter.static_per_tensor_quant( - a8, hidden_states, per_tensor_quant_scale - ) - a8_scale.fill_(per_tensor_quant_scale) - elif w1.dtype == dtypes.i8: - a8 = torch.empty((M, model_dim), dtype=w1.dtype, device=device) - a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) - fc1_smooth_scale = torch.ones( - model_dim, dtype=dtypes.fp32, device=device - ) - aiter.smoothquant_fwd(a8, hidden_states, fc1_smooth_scale, a8_scale) - else: - logger.warning("FMOE fall into pure torch quant...") - a8, a8_scale = aiter.pertoken_quant(hidden_states, quant_dtype=w1.dtype) - if w2.shape[2] * 2 * lastdim_mul == w1.shape[1]: - fmoe_func = aiter.fmoe_g1u1_tkw1 - - else: - raise ValueError( - f"Invalid MoE weight: {w1.shape=} {w2.shape=} {lastdim_mul}" - ) - - fmoe_func( - moe_buf, - a8, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - a8_scale, - fc1_scale, - fc2_scale, - kernelName, - fc2_smooth_scale, - activation, - ) - # fc2_smooth_scale) - return moe_buf - def asm_moe_tkw1( hidden_states, From 702b73c0a88c1db43e3f9e560fd6791ba0f8b26b Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Tue, 11 Nov 2025 14:49:30 +0000 Subject: [PATCH 08/21] remove unnecessary kernel code object files --- ...e_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co | Bin 27752 -> 0 bytes ...e_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co | Bin 27240 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100755 hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co delete mode 100755 hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co diff --git a/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co b/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x128.co deleted file mode 100755 index fd64a2219642ace4ebb4047c008a44a22d6a2929..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27752 zcmeHQd300PnZK4~+18chUADo1j12+DmNynd1d+i8ld#wrgAF)B*zyv*;U$44m}hyx z5VM(mrz}aDq$$lxHlwC#o0&oynwd78(@t|vr)T=d%$Yf-vwCK#`M!JabFLJ%1erg2 zPVJn>Pv7tRzWctneD7Vo_lv!^Wc3P7SeQdc_>Y!+LBhmm1c&(iyGH(DT{v=uBoh8L z6C=?Bjkyl){t9}tozjkw|%1>ozy{ zZyWH(c5af(2Ju1UX~6tCd1}78ed~YQRfpU6b=Rlv^K!_;_1*6eu8ZwYEEsYPt^GtJIan1X??<54g%J6`RJ3yc}e8_b&3qAXyK%$g344U+&5qLDqNgDsKkK`lD5<_tCnh zmhqkAT^UBfkC7L!)23+{xG5C)LM@S7} zge%3;Zm@5JtGNPTCE*Aq?QZ8|F=Tgu)4rU=Lx0Yi_@`aD*^~ z|Bfi0@Cc<79$|GTBe>A}&|c08TuY;ZSsI}7r6<>eQa zyk~3gXnM-ka0={Up>x z*l|#g|BruF4)NZ`KZaY60z9fUl784I^*v3PFKp1~1BXWMg@57YlHSXP(CfWcFcqWT zn~S_w?{y$X=)Gl#k$UeoM4jH-ju@r)Du`6?J%VV^doKehVBbJHM&tDYH~>EdQ~Sqx)+u?!)9AHFSv ze1?$E5b_y9K10Z72>A>lpW&m9UwozzlB{Sh?#2D4y=)(`=j(2T#r5A($fu(Ya(7hQ z51h@WC^}=1;1U}Bs0ApaMTN2?M0o(|f3wRRH0r)e(GC+$3&JnMae3b^TltM*kht{S)*f@3wVw2zbtThUzUsQ`QZxcz+32Cz4b+b{K0 zg1vFvezj*K*j3~9YdxF5-aKx<)l&ub)^YpYo@%h47`I1zB7F7{D_kTLd%oh!FHjUK zA^oJv>Z#hC!H?fo>)_U{8MZB(8~fmR`l_}XBjH>}+p}4hp@Dn__PM>pJ`!HKWMl~( zOM7y%qNJpFB96c@1-_p+GVBgVqZdBg9m^V%$uCD!$gf7HkiQ?DN`5_BQUv_gt%_1p zseP%k#A;QPv^3Asj0{DYHOmwJION8}C`xRs$4A~7<+Z*$ zitp`x9c3%w~qL5whSG(<--o5n;c{dkPTHerk*Q}`a{7#V9|u~Bh|o@pO(!1X{m$CVsQ>D8nRc>C_Rcs1Ly(N0rmsx z0S5si0fzt$fCm7LfWv@hz(asBfJXr107n250FMJ&08auY1D*z)0(cfM74SUZbij)} zc#dAF(i{1CyV;l>p>M>V9fHla>-CKuy*{It#Y7r=$>!PFQIU-i9ynH!k&T|n$c$al z!*Cls0LveKp&{gmgX3|9<7qF)^F+t|mmM6hDI8z)a(pGiqr0^{vj0{)oI^OCLVo`l zNzi#q*5SoHy6- zJa!IZ9-MLSihU&8a)IgNR6P7mYyiVDn#aU}(8`_c+eDQG9G;=E-I@anmbd2sHdKe?Ryr&NL; zwlB@j^HQrhKfQ+Mu^xe*Y!~N*19GitS`*^TSz43YGiYbb<^1eytx2)6bK?f`bMiQ! zIUnLhZy}$xi1W53T9cQ@zflalwJ6?%II|?)r1l`%8LK!ydkyeuZ2X(Z&nf48W+m{V zSCP-E=De*Y-sIr%@0S9fo0e`u%%7Fc>c^f&d){2mFUU?eDH&{hFY<+XoL@X2;_=u( z?;~$t#QCL5(oJ3-|IupTbBhX0i1{T2tbV*kpgnID=NGI2eij@5W8@3VIls6Pcsv$x zEkWL1&H1G@1ttfNzwIltvvb>CZn(8bSu`Nqia|S07Ht;yND@}|c3HXZ)^ABb zKe|Ley0z?lwW|HtB>J%hdH-v(0<}f%6#du@ez_lai+!L_c2OdGbAB2-X+l68-2-XWKX4k4>Tvjq0cx%EAAZS;HO`lu52{^XN^?>>a8eI|QGe z<6tpQol4_iF;AOD<6tq*m_g%U8B0##`BSFy{M2bYfBFoTZ|YSHv9Or>Jcd|UOauLf zSXfN^1`V;Wj46sC6Sv>vF=XQQ_xBqzar*}c4Vk$8`-cp1u$Z4aV2Fdod~nzh2aEae zAwwK2V?#qc|G)vBKRnFy4;^CprlUs;v9OqqjTmBKF`YPWh=s*;>ZBnSma!v844Jt7 zBO``P-2UUo4Vk$8Cr=tOar@7lHpIbVK6lm-2aEZ_c|#m5=1Uh1aj=Y?KF#ybp5^)H z&-46?7g>H6-m9UHW|BTd39B~rYPT3X;WdW72&W;7{P=FxIh`&WFdNVYXamdv%mK`T z_-u%`L3|F255JoQSC4EGem5J?251A!0n7o+Dyr(qF528tgS@c2*?=}c8(Rc#ocTMUDy8pFQmudzK4pAGRgh|gj1_rE=}jD5;l@tyL1x2wN8k`8P! z(7_rb-RCsZ{{3J-2lhd*51Z-uv5M(ahGMG?`{TD54kXqXo=f@~+XnI35O0I{92S57 z+m@bi|Ho>mI)?7w5=RfzB+%!a7CLkk>|R5({8Z6uiQ>5O zQ%{7JB*MKk5$>aja1Tv{`)4os`>B@nf`2etOA_E-nE?061h_{g!2Qt!{-Jm+@qqt8 z66C|ZFcI#9iEs~0g!^9~_=l&$6Ke1ungRJGSx}d)X8^v_55P0HOK{BktF~J4c^e+T zUO0aI*_1fe@SQqKJkKMmrhJ~Iz=MHuz+S*LfF8g~z`iff1AL}1D4y@h{T~II97!$SJj2LGUwdV3#V^S*2 zSZmz03K$D|*{D;n)`X~9R=dR{YM#eyPDtrAW6hRnyyoPnK34k_lc>3n*PNVk%#1Zp znZ|2Qjk?NepKcO0+xrx(F*W6NGvf4VubbsLYoSs&uvS?+uufSw5O^#sX=&7wo=z>Z zW>L#S579Ae8l^aInwd^1&YLo3QHt}X*$+_)^QNS^b7|7U57VUVY?_plL&xUKSFEIT3yi;N6-|2lahkMd4NY3Nj*gYC;_;6?&f{0F;qhzNv3N^)Iki+& zP)lVcwQSl%$I8kn-3sHaub{LBa6=`fPXKP*M5zu~$$~!6LU0lNb7X$Jx zJ<9n@Qr;;k?{X3MUk=Fo=5o$|Q_4FnzG$U4NYO z*QLC(Qr`1xxc~WpycgDS{sk%ToRs%sIrqO9koT<$&VNhFJ1^zERLT7>1?0WFiSsW@ zc^9O-SE{-Hm4Li&Z{hs6rM!z$-m5j-|7t+qcbuI6j+A#v^p5qJ*m z0z3}b2Y3?rCjn0b4g;PAz60<);Bml>_Skgn#`q$p-)W<=jpd{td;F4gL)!+@2!*8&^m+_&2WPcB=4Kl}a}Ft5$J) zy6|s)OtQhh`EhPrg@5a6$p-({HQb&l{7k(&$f1lm!@w@XI$iqDzac1IY=9_>S5QjKB{5LEPzw5q_Jj5Z+PW-*u3pfYj z5HrKI(du{CkC2Br#LUF_=#K%jAP&(Mp3dUr@0z#K5Ale$#DZv?V_=QBQQtQs=9@&V z@ZAw>%uV^sjF>;|Gd{+83Ba&G4NMYn_+!do$vKX}spbC~Y*>yx1ga zhVO7#Yhg-!G~(iEyk>h;I;(xDNz@GAxv)liNa#2fdEw#Aa)Y8yE z$F{ks5$71&YpD_E7r4j-mtL;HFBf#-Pq@IfAb=rD`796d@c$Bt3Ui4)Xv z>J%M2a+DfzjxlnK8gY(s`~)@P9OL9EYQ#ClnKLx$+&P+b;Q~#%bcv3gKEvbBp5yW7 zFYx$_msmW`C2qL*HSk8jweh>Pod2$LZM-2}Q(tp)|7!txuQzc1bt&(=Qr;U)-2X;E z-kUo(|E84pnw0le3-`YjkoR^Q=iip{UYGK2?&SWP0eSCqa{e7D?+q#M-EQuGHz4o5 zr#b(gl=r5T_x^6~e?K7agJ(GZft2@_lsCGE`$q%vZtdm#Eh+D9DbL%>{oa7Q@AYy1 zds5y_DeuDp?*A|#@B8~W|9vU%9Vzdl{oMaiK;93YSJ||0^l)eJSs+PjLTV2juN{7|L@03oL@mVlJ6C*XFEkLS zjX*zYA>?A1!^Er8-ae*Ldfr2Q4@hY zY9TOB)<7VS+K3@Q)I#9)sqb7-6M;NxAux~WbtDZ0Y9r8(S_t`_D{3N;M=b>A$r=dc zQ5!J~h*}8teots1P!oYXY9TO>>3<{*1ZpGDk6H-%ohxc0kVh>9=BXM8mv1_Eu=M63oxErk5e6*UpaqZUHeK%gc9dDKF%V<9vUsEI%xwGgTX0&Ubp$aBn4 z-n|CXLiqI~Xd)C?mkzWLcpk|5H01T5jX*>#gkL{`CIWdQXdy69)<7U{25m$?AZj7} z`Vlk{$m>80fqAmt4S79iBM>7&3*py~CqzGj76S8R4FtxUK^rjyh*}80egsVf@(G}Y zz&u$$hr9)}5s0XT@asp=L?E9ES_sUOH4w;82W`YKAZj7}`Vlk{$R~gn0`p`&9`Y8@ zMj$4G7Q(L|K@)*|Drg}vPt`ymHyv~le*Fkq2)}*=O@ssDGC&I; zB1!>K3*py~pou^pwGgrfVgYC(kVh?qUq6B-0(m=VAyf?n+Dkzbu^JGy5Ptm#nh50Q zffhp6Kr8@F1oDNTh4AY~&_p0_2Q7rEfk1mHXd))skCR#m_#QW@g_zVrOllz}wGfk9 z2=@QpD4MVS|0k1L2zZ`9sfGB9)k5I>5H%1-VBUqAh~t34v=A7F8VJ-GpeEuxU@$EN z#-Rq{GWbywaSbq-76RjB4aAEOhuVl&0RNa4g4GHd2&@$~5m+l~Ayf?n+Ng=Z8c_?u zY6cAi+Ng=ZnuBQ}Sk0h;z?xALfi(xyLa>@a1A(=oCIV}n)Iv;ZA@IG&q!xldm!8x@ zd`Kp>5T&0^Y9Y8*;4e)J@$>)wsuqIXw?PAeIs?>1pspa87J}WcK?8w01Jp#Io*!)IzA=6VOIY1lEXJ z2v##_Akaol1lAl(3&Cmz4FuMVnh2~pm==Q73>pZm6*Uo9>!cQ9QVW4|j7cp7pJPmF zAwDFNT8Pq5C$$iNnRATM|5UXQs8{KQh5zjk#2YF2FZ_Ba4kTG1{u&Sbv5Vi&W#A@x zO5nf$!PkZ#|4Zb9#+Lo0ig;m;!|JA1_^Ta@t?;)ztgBZSTk~^m@Sk-~PIFspgS*4o z?&@m9zsTY2Y;$%s!tYq?Xm9Cs*1B4qo!#y2Z5>?@QM<#{(A3%hez&vEy}i4^+08|Z!hP8fPZsEfE1>5K47uM&x z7u3&RIN!b9Ra@vPbS=!ESLfQEmtRnqyTCnnyzH-h067T!M34>L{w|}+Z}8{&pErph zr3L={OcCF_*q?|0p8$R$$g|7+`9i^$28_?cBFL^S{_$%?{7{`g{|TC!NTU9o8P!`2 zAAgD6!SHwczjQBlsKax?I}<8zGy2;z*iZ+uhC1y!=J;#mC&D z`Qsn+^YT@;pm>@3aY5b+^<&Ze@qp!dB`RC8AV1_^`4(~_{;oJ@arN4wHLC)Sv3e}- z2l0HwU#X)}lhh9a)xWEwy45~aZOp~qSJTw%fr_Of5U&Tgu1I04*NJGb|HJ2NQ}fm9 zfr=R-UiGW_f+l$Rz_C@odi_w*Dgsr%TEB`jz^|3^)$5gt>h($t!?r>FjQ9{NxLxY_ zUUB_W(E&dB#~~Z=IR_TZ$6s^AejXMm{tndm-+K~tU;*=i$6VY7{89T-{rgPqoaTR9 z3O=j*FGlc71*hhJI=~&Dx;f^m7j6-cwaPzhz(6inCgHQ1|D6C{%~x?F_|<(Uhbr~l zvWh?ufDkN2tzSKVo&Zj*ooDcWFE?;hw_o+C>s9SW_z)~V1|9J8a}MMG2Il`i8hkY~ diff --git a/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co b/hsa/gfx950/fmoe_2stages/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x128.co deleted file mode 100755 index 978419b0c9f64bde8cc9ef9cce39c7ae64644f77..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27240 zcmeHQd300PnZK57S=N>01zWPg5E&Z+jxD^f7$S%aHkgFP#u#jt2w}_E!5dx@Xo7i` z7Ys3**>}p4q)D2Pooq%~+NLdJX=d7FPCCswou26*?U|m_>9ohwiRSz6z0X`JN@-;N z=sC4=9>4efzWclHd&_Ahd%UB$~5Ya$|?k%WJ=R(GF>KnSu(02_jT6j>o8j5(rWGdpJ)7ZYbfgibT?3t(&TO zep7cKw(yW#Fp3$W(?Iz5=v2MBeye`dUW4oR_byM}=4GJ6<$c=^E{knXEEqpQ_^g%F zDyrK)8t7w?vjG+^FIrMsL5TM`cVkUmo7=wC)7s=|Xgf4;#qMt1()I-@?R(Ddu5Rye zH@Mu@)t#@~NB@ldVRu7^$F;S-spk8^A z9W9VUb#)GsMDfTVY8E(H*YvVr7|HVrht_-L}Na5TZL>;*yaPH-%W* zZA+{RA-dBg7LAnH5@Kn$EpdAY(H$=FQu*?c8gD2emUr7CZ}f*y-SH|{m#W9(je{YU zcl#<&gizh#Dle9=9Vzlch{fH$$frW6?r@Qp%8PGp%Bvxkcl#=@g;0HCmFji0roM4x zlp(^4-Xde;I8DpN+PIFT&{H z%4T7;Oi?9c>4{3!{U}o|e}3k&W<&FnS2s2>(6W4*oLQ4*nWO z2Nx^b)JPFo| zLl`|=_7uq*q9u$j0yjkasC(Ex>K+~oqlYUV`1UFt5=t0-1a66*QTNb4>K+Ed=;3mE zbIaCSH&X}0=pt}K92s>FCq~`FnQzm>>-H_J%^fYSHqVwuPgA?Awtl;(#)S`gKg2f) zud=7e@c4M7fxDr8OH<@4U*l17GvdNEd!u{1tG2=2?s}xTb*t!{>wnp!iLD zOKbhZ?sm^eF}JEEFWXx^+u&h#z(Nnh4RS=pGxoMEEv>HV=8mTJO;6g}b~J8oZutD( zaLcf7t$}agBR$^Kkmv1>xF7b^wl>2C#O>fzAHhHDX>F@-ZW>hI5Adh>U;h{L30ioX zuf7ivP38f{!y63WcuqJ4kL}e7Q}DVVg2dSvS2?fdg;}Ow>$Je*gzePV2{3!>6Ylzx zaD}kpZY^ERVRg4YPVwI67ISOvKA62vYa~6eQtEq}2!BMM-wz3m&IkX(%O#zUb)nPw zY!E6&oi7J@tSKY2pnplEX( zPy_sFfd>RWBk;VyO9G!1c$i_5KZ(_m;!j~{@|ze=_fKay(La%4nm>)<6#o>4@&0&* zpx=#e%RtW%^bA4I5cCW|&k*zsLC^3$=jWd)gd{7PbGvZ6X)ibi9eI)0yW)GUE9B!L zC;4PZ+XI~4t|&8Rc-Kbo?TPJr9FD;Qq;jgavNFR_Sy`uO)^!r4^ouN#rSm~7=A*G~ zw8Nobdq-B5qGV@#k0tVcehwH18{Luw#|YpWz#PD1fKI@l1C|0>EO3kft^sTT#9FJATSq1wsnaunoYX>3a{&L2Z1D7JW)&A9tTMgWJ!BzMx7*_$@1i`KGuVLI8 z;8F#**1wiXG>A&I*XvjKt4%u7S96B!01TEyU|a;#WH>A>J?&ztLF*@q;7rP-nE?F=&I6gyP6k z{CW9`Vk4x7RNB0i>oWNM+hFV4upz^~eqCKR>`#B?2BRL1b&S31A~Q6gS74hviXDSd zCG!X8!@hJRCo4)yiZ}Wo>{H--h%>|Cbk_M`-r-zWmrTAGN+G`-8c+UxXaf1wP;nvf z8#XA)#*JR=4^LWAOGRHsqoSBhysyRBS5bzesHn~d^Nym$b&ka)b&itz>lUL9&I^9> z;Se11WJMQ@+xgcN^;mRFo2Dq!r+fK2#bx;766*fedSmTI#Srwk|!e%*4bx zhs9E7&>A{N)*bk$`?22nO4PCig9{uBl=(KBqNJsH7i458%9JVIs0TnB7pExk@m@c9 zeTdik#t^=@_mhuU&3^JGJAce0{|4ujmxnZ3U9_LPB66>SCKB4d2B?YjlfI#QpT2gV zpWGZWP9A(akNkBA&Vik|3vXQaYgTkZuYSL#6mgedv#Qg-3w9D2xb8O^r{b9Ko6QWp zMq`H8Y|bEYrC;hy(F$U;nWG_A&te*mNCmdNvI$vmHy+NaDD$FQOyyA+Mmsc1BS zUcgAe9zY#nAD|wvAJ71}7tjbe0B8o>4;Tk{5HJC75HJz&2%rV<7+^Bs3Bd7yrvOs{ z&j3yWJlBcG=*3E%ksr6~jM>q;Ivlh8u-XotuFk8|WpuGNdSe$^H#Iv(Ul;9#eWlme zdG-2??Xd%J8QcrgSD$MLIq2kgMB#YC$MFn_WcdqDj+YgVPx&}L8|{s}uGRNkZ-HY7 z`%}mtKP72eGoogS*37oI?YM$**8C++a%X#yB zu4Bgx>Y(r*TwdWgGoq$=oSE&V|f^p(>&RepP7so2<;8?|caxTx0p9OhX zzqFZLmpYg8ljd_BJElJ2tK& zKP{K@nX{n17+c6^&E>p(zSiX9?O!ee-c~ryggCi)oJk#n7-uZy{M6;Zr?LL8AwR8* z^O+UEi?ND)Ru$*%8^@WPy!~4xz~`i;n-KG+q_g^QOk+HAI_GC+r<;@v*1r$=f?Uqe zn+5H0-@w>M-Z7W+3+AVre7ybp%Ye@*%r_zC73Z`1@f?Bi%%z;4y&U){tp5*?FDT>u zyb9oPU%vNU4y>dKHnb)hH%hYqxI&qC85v8w{ zmPKCw4G9{LZZRG`T6VnJ)bUs^#$zM$f#+rgYSV8M*Q-fKP)fKEyklKovq)m;qc~ml%(|`GMncj~I{pfLF&OuAe#{4~g-3 z6!P+TJSoQGS*}yBSpnnGEyklKXgt=7@z@AFKWD?a9LCEwF&;aBm&fA{F&=j!vg7jz zjw4%_7>~U$zQi>V`HVecJnkE3Qm+ljPdg;W<56fYkH?c@Jf0nAQm-L6j&j^$JbGY! ziEAwKGwa28Yy_TPQ!XH1uuY7|4&deSxI>J`o#`g^nuOyhr%Q~-UKn3c|HydUBgW&t ze3N?ZV&n0U7>`GT#^Xsb9?#~R)N7nP{=ylL5ilO-eC56V4!Jz^4!JU#byXp|^_R#a>fH1dWHuq}9@Rm`qVoG#(~X zbTo~JX_!RN*wP69utrN`%K#&zXzVILT{Mj?kMNsgVrT+PW=d%SOlFgbCctEljim`N z4eMjL-axtDXySTvEYq9f<7qrhrg7tFJWQsfBpMHs$!ewXFb&7W)7aIpyo7NywgNCQ ziN>x0v{-5ET3CKc3Qd5?JYfP&fXO^@B29qFeAitx0jA;P6s{jXf$LKza{Z*cnBLT- z7~)|vb$bo*FqwLL4Dm3TcJ~?LVH#ExLnf}j*K5ec_3!C1Wa9ew^%*j8{rB`65@0ev zzSocdlX>5OAps`yf&GRAn1=iNxqk0nt{)iS`u+Qv-gM}oAs!~v;Xy+@Os1np4Dm3T zjvq6`!!&&Gpdl02e{j%{iR*vlh#?c#|JX4@Ca(X<6NUts%%@Ko5@0f)J!43K$$b8t zApxf06DPR-)G4k%bB62Bon!iTyjH_FnoPQtEE4sJVj{L|z%D>LpcilkU^ie^VP$7_ z;kr(H;fBr`g%5V3EjugRU5K#h@#Guv4ibv}?VF zdN)SWZkLWKT@ZIe+zat;9mRe#^t_%PQBSX^r%%+g`y1++V(eB{i0^dYwyvHkJ?&j@ zpnV&Sbhpb)d-g#5IK=xPK47LJ+bWLN)-P(?BWl|#YJ2<}YD-W2_S>qzDvs`1pFsC+ zOr(#yEVTa+#D^h13h{9Z-&V=Io&izMK2gtpQO|*IsAo#*x36cQDw*zEKc4R2m`V@0 zCeeYD5TAzlEX3z0(SL+z4*2~sibg7a&9u(9=k(3;Ycd%oYCru%(UwOddZWuDlK^|@ zT84eGYZ>1^j_3DUd46C5&+or$Z6rLO$nZzQa{&0hfX|#r;i=EE68ZBW{LX^kp}OID z6e;}OXIX@7QaElV3t3#FaI7VZSX`%Ye25gWxIy9AM2cD5qHt^_i&@;RaD137Vexi_ z<9A32iyu=svS*OEj+=h4imjXbM{C%+)!gK<=OZ51{VR{_f5GF1U-G!|S3GX{4Uf0| zCy(2I&*P5&;_>!B@p#9ddHmS_@pz{n;^JwYNyV9+eoZAjBWPKaHGP^P>z+(QB3x?{ z;X0ED*O)}Oz9{74&lIheD9(#N^G0h)5?o7?;5w27*N`N*esn>;hiXX|J}Bgn^V4Tzega`OqmNN9(si>hYr(lff1Gsr_b zL|syw`2wIG+94XEx>>u7Y2C_Y` z+z$0(jd3xH%!mmlQELvbH7=#xjI}0AEQh{eEE^*gtTi#Fn$>PGiJE8fniEsn%viH! zBCk0)rkmA1-Xv-+;58?w95!Rk<0tZ(Q)4c%+9#Pr&5mvbYfMdf$&5H@;!9@vo~=MB z=v|?#=v}F->Ze(Dcm<$ zr%$KWd+wpu>}+bCF@p|Io6g&3-ox8xW%Ks-8LYh}H^YReeSvlETxz}dUTU2`pIYy`j}8~ih+2e6XR8%1CH19jND^fh3ldXuJwqMquyiJdi?JAGV*YZN1UAWsredU2DC$*8ue?| z4!!FoSO7Uvk!0@v_s5{(#ERa{oY3&+975pjf?#NFbmot+N080JNdib4a`G( zM0-+xEZ$>ajX5#znGy3$qE`6shBfA-d}>C_oA@ce$AIq(SnJG~U$fe0n?%j{FBx;85U|4HG%D7m>c@uffj+k^-`vQ}w8NPF2jgFN3Si}Vr z^JD++dknXmTB@t5#p9utEnDdDCO0+WJ;vs0YQ%et8V@z%Jx1*oYQ%et`g&^Jx|LcR z8>zLqnGV<0^Y#yI?ti7eHi(0z7sin7< zT6XWI!%7!5;ys49n;P*RqoD+i#I;Xzq z;rSPXbT4h;{7aJV1xfdEJqFKTRV9Etsvdok8=KPN%yLx8`{b9LqWRh zyEuPc(!D0>d|f>63(~#Q&G~mE-8D(~ZZFTj8>D-0H|O7zbgxUg_xJGp`$4)NJkI$a zNV+#9-4FNi{11b4KRUqqA4$45CEW*yc>aSR-H#7*{>PH;ElKy2qdfnUAl*-obN;82 z?rlkT<0Q}D2-4j=&H0;>Zb;I7c$Vis4AOmcp7S3`y6Zy6?lG>4dyE&wJ;ux89^+MU zk8w@hW4s~mG2RmQ81PLQukGwz_1Hbe&_CKe2EN<4`M+M+KMJ4p-miH9@ZLau=ZKyF z1+Jgy1sHkfi0`b02LQbRm`5*w{LT?Q0m!2l0Clnl0D1HV^Z=q4K=uHjCjfc$0-!F~ z1AyKD%%c}Te&>ju0OZjNfI8U&fINBw`T@}kfa|Beb3{)7^5_LX9rLS59su+PU>>~y z@;gWL1R#%I0MyAI0OZjdFaU^N0QP=McmU87fINBuP{;f)k_Q020hmWGfc(x8Jpst0 z7XWpt2LL(r1q8f%1TO&Fhv1uWLLNNH)wQJpuAPW;pL2LwNxL#v^zF6j)XycmeP@ko{T6>%bd;h+cqz@d%y(Iy^)PolwU_2fb;}N_7sFOVa*xn4@ zfPO&q0tAdl@B|>A2wnix$^J3qE#M76L@z+Vcmz)X@~PkjK%MLXKz~bK+29F49=!ko;}JXo$UDFbpn3o> zUI3neWq{}f2pEsx2|#`(cmZS&z-;gYAYTApfPnD`o&e+>-~~`U02nU-Przu$Ju0mi%l;7u6w0*rY9@c+F~G++P!55~Lz@H~6W3-C|t1;G0w^Z*=$`wsL3903gF z1;BRb0l@d{=m|Ij7|IKP?a%{o0rKbxxC|J|3xMrp55QB<4!r@-0)CShfYk~f0IU@~ z0az<~0aOnF#^?#a8qo{DY6cGg#^?#annQU3Sk2%8z?#t$fHjBm0(yl@_#ljfOYumUI2F81`hzfXGc!}zE=zsU>0Y6TB~ z>IuMF(F>q{Prw*G0azn?0a(r80l*kN0a$Y=F953M3)$S%&TSrSvb89=asNU+{ zQs1-%@*Y=>XLH9ESAA1$v#Ykb)#Ywza5cBNT3VZH>l-|6Gum3KXEZcC+?Wl_L!Rn( z7T9WM<`m4SoxOQxUO{b+XLjwZIkP;Q-PHx|0{5J}nKkasxq10DIkP>}N6P-%4A>#y z6HQjP1*VKDUmnQwKW`CDO7a8w$)bJ3yg(lO^zex$k1Y)33j|*h)c+#EZ(kSKeud!s zHwW?`VyMxRP4NHM$e$_*Lj6t`SdT;gP8YE^HMe_+y=F%f{OTC7Z)xhV*SXv3i1-)z z_EutVZLV>*yNSKsv%Q_Qb~n~n1MO*OZH8T1Qv*Ac*sGfx@i*EC{(@P%doy4isQDa6 z(%|0gX>jG{<<1hbIfb`o^K$c4Hos`0Iy!FtqZY@$DpSt~Dwc?bcs{^o=|!xb zCt@M~AHTm@)vMZ&sQp{=PS_- zTL*O>@ot!Kz104F;{2te6LRv$DF-k=118kt&)l%jJp#qw0UG&xKY|V}U>5MG#dW}s zIxaU=^SFfr|Jy<^uWr9M!7mV;s{go~Cq5SX9DZUFt$D2@0n99i3G2_1Ntjpl&j<0? z7C$Pkfq8Y?v4r?Xso^s=(GUy}1g{}#{p#`aAaH8!T*3dn;NhsQznW8*tHyOO6G|Uf N%Uk?HZ748U|Nld1yjB1J From 8e6533955c4ecdd6aa8220565df40cdc396f87cc Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Wed, 12 Nov 2025 08:42:11 +0000 Subject: [PATCH 09/21] Add missing comma --- aiter/fused_moe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index d428585979..0d2ab0fc4f 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -105,7 +105,7 @@ def fused_moe( moe_sorting_dispatch_policy=0, dtype=None, a16=False, - per_tensor_quant_scale=None + per_tensor_quant_scale=None, # following for cktile support hidden_pad=0, intermediate_pad=0, @@ -133,7 +133,7 @@ def fused_moe( moe_sorting_dispatch_policy=moe_sorting_dispatch_policy, dtype=dtype, a16=a16, - per_tensor_quant_scale=per_tensor_quant_scale + per_tensor_quant_scale=per_tensor_quant_scale, hidden_pad=hidden_pad, intermediate_pad=intermediate_pad, bias1=bias1, @@ -194,7 +194,7 @@ def fused_moe_( moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, a16: bool = False, - per_tensor_quant_scale: torch.Tensor = None + per_tensor_quant_scale: torch.Tensor = None, hidden_pad: int = 0, intermediate_pad: int = 0, bias1: Optional[torch.Tensor] = None, @@ -640,9 +640,9 @@ def get_block_size_M(token, topk, expert, inter_dim): { (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True) : aiter.fmoe_g1u1, (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_fp8_blockscale_g1u1, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0_a16, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1_tkw1, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0_a16, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1_tkw1, } } # fmt: on From 515256372c029f7459682b89818370dcff9f54e8 Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Wed, 12 Nov 2025 10:09:33 +0000 Subject: [PATCH 10/21] saved modified tuned fmoe config for testing purposes --- aiter/configs/tuned_fmoe.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/aiter/configs/tuned_fmoe.csv b/aiter/configs/tuned_fmoe.csv index bca541c83e..f6f2dbe25c 100644 --- a/aiter/configs/tuned_fmoe.csv +++ b/aiter/configs/tuned_fmoe.csv @@ -772,3 +772,4 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,536.7655,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,536.7655,1,47.26,2646.03 80,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,560.4425,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,560.4425,1,90.53,2544.06 80,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35 +256,16,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0 From c2faf4d33f4865dc760213e33cf27e1357605065 Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Wed, 12 Nov 2025 12:56:36 +0000 Subject: [PATCH 11/21] apply black required formatting --- aiter/fused_moe.py | 28 ++++++++++++---------------- aiter/fused_moe_bf16_asm.py | 14 ++++++++++---- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 0d2ab0fc4f..067d4102f6 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -162,7 +162,7 @@ def fused_moe_fake( moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, a16: bool = False, - per_tensor_quant_scale: torch.Tensor = None + per_tensor_quant_scale: torch.Tensor = None, ) -> torch.Tensor: device = topk_ids.device M, topk = topk_ids.shape @@ -287,19 +287,22 @@ def fused_moe_( elif metadata.run_1stage and doweight_stage1: return metadata.stage1( hidden_states, - w1, w2, + w1, + w2, topk_ids, sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, moe_buf, - w1_scale, w2_scale, - a1_scale, a2_scale, + w1_scale, + w2_scale, + a1_scale, + a2_scale, a16, per_tensor_quant_scale, expert_mask, - activation + activation, ) else: return fused_moe_2stages( @@ -470,7 +473,7 @@ def fused_moe_stage1_tkw1( per_tensor_quant_scale=None, expert_mask=None, activation=ActivationType.Silu, - kernelName: str = "" + kernelName: str = "", ): E, model_dim, inter_dim = w2.shape M, topk = topk_ids.shape @@ -828,9 +831,7 @@ def FinalFunc(): f"[fused_moe] using {'1stage' if run_1stage else '2stage'} {'default' if cfg is None else tag} for {keys} " ) if run_1stage and not doweight_stage1: - logger.info( - f"[get_2stage_cfgs] run_1stage" - ) + logger.info(f"[get_2stage_cfgs] run_1stage") return MOEMetadata( functools.partial( fused_moe_1stage, @@ -844,14 +845,9 @@ def FinalFunc(): run_1stage, ) elif run_1stage and doweight_stage1: - logger.info( - f"[get_2stage_cfgs] run_1stage and doweight_stage1" - ) + logger.info(f"[get_2stage_cfgs] run_1stage and doweight_stage1") return MOEMetadata( - functools.partial( - fused_moe_stage1_tkw1, - kernelName=kernelName1 - ), + functools.partial(fused_moe_stage1_tkw1, kernelName=kernelName1), None, block_m, ksplit, diff --git a/aiter/fused_moe_bf16_asm.py b/aiter/fused_moe_bf16_asm.py index 7baeb105f9..ea61d128dd 100755 --- a/aiter/fused_moe_bf16_asm.py +++ b/aiter/fused_moe_bf16_asm.py @@ -282,15 +282,21 @@ def asm_moe_tkw1( activation=ActivationType.Silu, ): return fused_moe( - hidden_states, w1, w2, topk_weight, topk_ids, + hidden_states, + w1, + w2, + topk_weight, + topk_ids, expert_mask=expert_mask, activation=activation, quant_type=QuantType.per_Token, doweight_stage1=True, - w1_scale=fc1_scale, w2_scale=fc2_scale, - a1_scale=fc1_smooth_scale, a2_scale=fc2_smooth_scale, + w1_scale=fc1_scale, + w2_scale=fc2_scale, + a1_scale=fc1_smooth_scale, + a2_scale=fc2_smooth_scale, a16=a16, - per_tensor_quant_scale=per_tensor_quant_scale + per_tensor_quant_scale=per_tensor_quant_scale, ) From 7a36ba6b97b390fd0f81f2d1dedad22b0c7574e0 Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:12:02 +0000 Subject: [PATCH 12/21] remove fused_moe_stage1_tkw1 and place aiter.fmoe_g1u1_tkw1 under fused_moe_1stage --- aiter/fused_moe.py | 215 +++++++-------------------------------------- 1 file changed, 34 insertions(+), 181 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 067d4102f6..834316a1ad 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -261,7 +261,7 @@ def fused_moe_( moe_sorting_dispatch_policy, ) - if metadata.run_1stage and not doweight_stage1: + if metadata.run_1stage: return metadata.stage1( hidden_states, w1, @@ -283,26 +283,9 @@ def fused_moe_( a1_scale=a1_scale, a2_scale=a2_scale, num_local_tokens=num_local_tokens, - ) - elif metadata.run_1stage and doweight_stage1: - return metadata.stage1( - hidden_states, - w1, - w2, - topk_ids, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - moe_buf, - w1_scale, - w2_scale, - a1_scale, - a2_scale, - a16, - per_tensor_quant_scale, - expert_mask, - activation, + M=M, + device=topk_ids.device, + doweight_stage1=doweight_stage1, ) else: return fused_moe_2stages( @@ -358,6 +341,9 @@ def fused_moe_1stage( a1_scale=None, # [expert(local_expert:EP), 1, model_dim] a2_scale=None, # [expert(local_expert:EP), 1, inter_dim] num_local_tokens: Optional[torch.tensor] = None, + M:int = None, + device=None, + doweight_stage1:bool=None, ): if quant_type == QuantType.No and activation == ActivationType.Silu and not isG1U1: # pure bf16 @@ -372,7 +358,33 @@ def fused_moe_1stage( num_valid_ids, topk, ) + elif quant_type == QuantType.per_Token and doweight_stage1 and isG1U1: + a8_type = w1.dtype + _, model_dim, _ = w2.shape + + a8 = torch.empty((M, model_dim), dtype=a8_type, device=device) + a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) + aiter.dynamic_per_token_scaled_quant(a8, hidden_states, a8_scale) + fmoe_func = aiter.fmoe_g1u1_tkw1 + + fmoe_func( + moe_buf, + a8, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + a8_scale, + w1_scale, + w2_scale, + kernelName, + a2_scale, + activation, + ) else: quant_func = get_quant(quant_type) if hidden_states.dtype != q_dtype_a: @@ -454,156 +466,6 @@ def fused_moe_1stage( return moe_buf -def fused_moe_stage1_tkw1( - hidden_states, - w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K - w2, # [expert(local_expert:EP), dim, inter_dim] - topk_ids, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - moe_buf, - # following for int8 quant - fc1_scale=None, # [expert(local_expert:EP), inter_dim, 1] - fc2_scale=None, # [expert(local_expert:EP), model_dim, 1] - fc1_smooth_scale=None, # [expert(local_expert:EP), 1, model_dim] - fc2_smooth_scale=None, # [expert(local_expert:EP), 1, inter_dim] - a16=False, - per_tensor_quant_scale=None, - expert_mask=None, - activation=ActivationType.Silu, - kernelName: str = "", -): - E, model_dim, inter_dim = w2.shape - M, topk = topk_ids.shape - device = topk_ids.device - lastdim_mul = 8 if w1.dtype in {dtypes.i32, torch.uint32} else 1 - - if fc1_scale is None: - # pure bf16 - aiter.fmoe( - moe_buf, - hidden_states, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - ) - elif a16: - # a16w8 smooth quant fmoe - if w1.dtype == dtypes.fp8 and inter_dim * 2 == w1.shape[1]: - aiter.fmoe_fp8_g1u1_a16( - moe_buf, - hidden_states, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - fc1_scale, - fc2_scale, - fc1_smooth_scale, - fc2_smooth_scale, - ) - elif w1.dtype == dtypes.i8 and inter_dim == w1.shape[1]: - aiter.fmoe_int8_g1u0_a16( - moe_buf, - hidden_states, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - fc1_scale, - fc2_scale, - fc1_smooth_scale, - fc2_smooth_scale, - ) - else: - raise ValueError(f"Invalid args: {w1.dtype} {w1.shape=} {w2.shape=}") - - else: - # a8w8 fmoe, opt: smooth quant - a8_type = ( - w1.dtype - if w1.dtype != dtypes.i32 and w1.dtype != torch.uint32 - else dtypes.fp8 - ) - if fc1_smooth_scale is not None: - a8 = torch.empty((topk * M, model_dim), dtype=a8_type, device=device) - a8_scale = torch.empty((topk * M), dtype=dtypes.fp32, device=device) - - # moe_smoothquant_fwd need topk_ids which contains local_expert_id - if expert_mask is not None: - local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32) - local_expert_hash[local_expert_hash > 0] -= 1 - topk_ids = local_expert_hash[topk_ids] - - aiter.moe_smoothquant_fwd( - a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale - ) - else: - if ( - w1.dtype == dtypes.fp8 - or w1.dtype == dtypes.i32 - and w1.dtype == torch.uint32 - ): - a8 = torch.empty((M, model_dim), dtype=a8_type, device=device) - a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) - if per_tensor_quant_scale is None: - aiter.dynamic_per_token_scaled_quant(a8, hidden_states, a8_scale) - else: - aiter.static_per_tensor_quant( - a8, hidden_states, per_tensor_quant_scale - ) - a8_scale.fill_(per_tensor_quant_scale) - elif w1.dtype == dtypes.i8: - a8 = torch.empty((M, model_dim), dtype=w1.dtype, device=device) - a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) - fc1_smooth_scale = torch.ones( - model_dim, dtype=dtypes.fp32, device=device - ) - aiter.smoothquant_fwd(a8, hidden_states, fc1_smooth_scale, a8_scale) - else: - logger.warning("FMOE fall into pure torch quant...") - a8, a8_scale = aiter.pertoken_quant(hidden_states, quant_dtype=w1.dtype) - if w2.shape[2] * 2 * lastdim_mul == w1.shape[1]: - fmoe_func = aiter.fmoe_g1u1_tkw1 - - else: - raise ValueError( - f"Invalid MoE weight: {w1.shape=} {w2.shape=} {lastdim_mul}" - ) - - fmoe_func( - moe_buf, - a8, - w1, - w2, - sorted_ids, - sorted_weights, - sorted_expert_ids, - num_valid_ids, - topk, - a8_scale, - fc1_scale, - fc2_scale, - kernelName, - fc2_smooth_scale, - activation, - ) - # fc2_smooth_scale) - return moe_buf - - @functools.lru_cache(maxsize=1024) def get_block_size_M(token, topk, expert, inter_dim): cu_num = get_cu_num() @@ -830,7 +692,7 @@ def FinalFunc(): logger.info( f"[fused_moe] using {'1stage' if run_1stage else '2stage'} {'default' if cfg is None else tag} for {keys} " ) - if run_1stage and not doweight_stage1: + if run_1stage: logger.info(f"[get_2stage_cfgs] run_1stage") return MOEMetadata( functools.partial( @@ -844,15 +706,6 @@ def FinalFunc(): ksplit, run_1stage, ) - elif run_1stage and doweight_stage1: - logger.info(f"[get_2stage_cfgs] run_1stage and doweight_stage1") - return MOEMetadata( - functools.partial(fused_moe_stage1_tkw1, kernelName=kernelName1), - None, - block_m, - ksplit, - run_1stage, - ) if ( dtype in [dtypes.bf16, dtypes.fp16] and q_type == QuantType.per_1x32 From 67c74ad1bbb2067adb9be0e0653b3605fb0e0ae6 Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:19:05 +0000 Subject: [PATCH 13/21] remove unnecesary arguments --- aiter/fused_moe.py | 8 -------- aiter/fused_moe_bf16_asm.py | 2 -- 2 files changed, 10 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 834316a1ad..3f3868b226 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -104,8 +104,6 @@ def fused_moe( num_local_tokens: Optional[torch.tensor] = None, moe_sorting_dispatch_policy=0, dtype=None, - a16=False, - per_tensor_quant_scale=None, # following for cktile support hidden_pad=0, intermediate_pad=0, @@ -132,8 +130,6 @@ def fused_moe( num_local_tokens=num_local_tokens, moe_sorting_dispatch_policy=moe_sorting_dispatch_policy, dtype=dtype, - a16=a16, - per_tensor_quant_scale=per_tensor_quant_scale, hidden_pad=hidden_pad, intermediate_pad=intermediate_pad, bias1=bias1, @@ -161,8 +157,6 @@ def fused_moe_fake( num_local_tokens: Optional[torch.Tensor] = None, moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, - a16: bool = False, - per_tensor_quant_scale: torch.Tensor = None, ) -> torch.Tensor: device = topk_ids.device M, topk = topk_ids.shape @@ -193,8 +187,6 @@ def fused_moe_( num_local_tokens: Optional[torch.Tensor] = None, moe_sorting_dispatch_policy: bool = 0, dtype: Optional[torch.dtype] = None, - a16: bool = False, - per_tensor_quant_scale: torch.Tensor = None, hidden_pad: int = 0, intermediate_pad: int = 0, bias1: Optional[torch.Tensor] = None, diff --git a/aiter/fused_moe_bf16_asm.py b/aiter/fused_moe_bf16_asm.py index ea61d128dd..87a9ccbc43 100755 --- a/aiter/fused_moe_bf16_asm.py +++ b/aiter/fused_moe_bf16_asm.py @@ -295,8 +295,6 @@ def asm_moe_tkw1( w2_scale=fc2_scale, a1_scale=fc1_smooth_scale, a2_scale=fc2_smooth_scale, - a16=a16, - per_tensor_quant_scale=per_tensor_quant_scale, ) From 1cfe55f07522eead4717ead37c79bca87c46160a Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:29:30 +0000 Subject: [PATCH 14/21] apply black formatting --- aiter/fused_moe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 3f3868b226..a6787879d2 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -333,9 +333,9 @@ def fused_moe_1stage( a1_scale=None, # [expert(local_expert:EP), 1, model_dim] a2_scale=None, # [expert(local_expert:EP), 1, inter_dim] num_local_tokens: Optional[torch.tensor] = None, - M:int = None, + M: int = None, device=None, - doweight_stage1:bool=None, + doweight_stage1: bool = None, ): if quant_type == QuantType.No and activation == ActivationType.Silu and not isG1U1: # pure bf16 @@ -498,7 +498,6 @@ def get_block_size_M(token, topk, expert, inter_dim): (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True) : aiter.fmoe_g1u1, (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_fp8_blockscale_g1u1, (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0_a16, (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1_tkw1, } } From 4902128e33361963ce34aa6af8b3251c5a6bf66c Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:40:39 +0000 Subject: [PATCH 15/21] simplify aiter.fmoe_g1u1_tkw1 call --- aiter/fused_moe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index a6787879d2..bbd88b3340 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -358,9 +358,7 @@ def fused_moe_1stage( a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) aiter.dynamic_per_token_scaled_quant(a8, hidden_states, a8_scale) - fmoe_func = aiter.fmoe_g1u1_tkw1 - - fmoe_func( + aiter.fmoe_g1u1_tkw1( moe_buf, a8, w1, From 0a6643558e442135774e90804a2d47cedb3984cb Mon Sep 17 00:00:00 2001 From: antsaukk Date: Tue, 18 Nov 2025 18:37:48 +0000 Subject: [PATCH 16/21] add doweight_stage1 column to fused_moe_1stage_dict map and remove elif condition to select run_1stage=True --- aiter/fused_moe.py | 49 ++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index bbd88b3340..a694a2684d 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -478,25 +478,25 @@ def get_block_size_M(token, topk, expert, inter_dim): fused_moe_1stage_dict = { "gfx942": { - # activation, quant_type, dtype, q_dtype_a, q_dtype_w, isG1U1, API - (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, - (ActivationType.Silu, QuantType.No, dtypes.fp16, dtypes.fp16, dtypes.fp16, False) : aiter.fmoe, - (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.i4x2, True) : aiter.fmoe_g1u1, - (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True) : aiter.fmoe_g1u1, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, True) : aiter.fmoe_g1u1, - (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, True) : aiter.fmoe_g1u1, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1, - (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1, - (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0, - (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False) : aiter.fmoe_int8_g1u0, + # activation, quant_type, dtype, q_dtype_a, q_dtype_w, isG1U1, doweight_stage1, API + (ActivationType.Silu, QuantType.No, dtypes.bf16, dtypes.bf16, dtypes.bf16, False, False) : aiter.fmoe, + (ActivationType.Silu, QuantType.No, dtypes.fp16, dtypes.fp16, dtypes.fp16, False, False) : aiter.fmoe, + (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.i4x2, True, False) : aiter.fmoe_g1u1, + (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True, False) : aiter.fmoe_g1u1, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, True, False) : aiter.fmoe_g1u1, + (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, True, False) : aiter.fmoe_g1u1, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True, False) : aiter.fmoe_g1u1, + (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True, False) : aiter.fmoe_g1u1, + (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True, False) : aiter.fmoe_g1u1, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False, False) : aiter.fmoe_int8_g1u0, + (ActivationType.Gelu, QuantType.per_Token, dtypes.bf16, dtypes.i8, dtypes.i8, False, False) : aiter.fmoe_int8_g1u0, }, "gfx950": { - (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True) : aiter.fmoe_g1u1, - (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_fp8_blockscale_g1u1, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.bf16, dtypes.bf16, False) : aiter.fmoe, - (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True) : aiter.fmoe_g1u1_tkw1, + (ActivationType.Silu, QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, dtypes.fp4x2, True, False) : aiter.fmoe_g1u1, + (ActivationType.Silu, QuantType.per_1x128, dtypes.bf16, dtypes.fp8, dtypes.fp8, True, False) : aiter.fmoe_fp8_blockscale_g1u1, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.bf16, dtypes.bf16, False, False) : aiter.fmoe, + (ActivationType.Silu, QuantType.per_Token, dtypes.bf16, dtypes.fp8, dtypes.fp8, True, True) : aiter.fmoe_g1u1_tkw1, } } # fmt: on @@ -643,23 +643,12 @@ def FinalFunc(): ): if q_type == QuantType.per_1x128: run_1stage = True and (inter_dim % 256 == 0) - elif q_type == QuantType.per_Token and q_dtype_w in [dtypes.i8, dtypes.fp8]: + elif q_type == QuantType.per_Token and q_dtype_w == dtypes.i8: run_1stage = token > 32 + elif q_type == QuantType.per_Token and q_dtype_w == dtypes.fp8: + run_1stage = True elif q_type != QuantType.per_1x32: run_1stage = token < 256 - elif ( - doweight_stage1 - and ( - activation, - q_type, - dtype, - q_dtype_a, - q_dtype_w, - use_g1u1, - ) - in fused_moe_1stage_dict[get_gfx()] - ): - run_1stage = True block_m = ( BLOCK_SIZE_M From 4c5ebf6fe71ae668484c9ce170e86af473816c6b Mon Sep 17 00:00:00 2001 From: antsaukk Date: Tue, 18 Nov 2025 19:00:33 +0000 Subject: [PATCH 17/21] add doweight_stage1 to query key --- aiter/fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index a694a2684d..ff86e775b8 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -630,14 +630,14 @@ def FinalFunc(): kernelName2 = "" run_1stage = False if ( - not doweight_stage1 - and ( + ( activation, q_type, dtype, q_dtype_a, q_dtype_w, use_g1u1, + doweight_stage1, ) in fused_moe_1stage_dict[get_gfx()] ): From 0430e19b6411293e1410b90ac03b23dfcc9da932 Mon Sep 17 00:00:00 2001 From: antsaukk Date: Tue, 18 Nov 2025 19:17:24 +0000 Subject: [PATCH 18/21] modidy elif to select run_stage=True for tokens > 16 --- aiter/fused_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index ff86e775b8..9888b50486 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -646,7 +646,7 @@ def FinalFunc(): elif q_type == QuantType.per_Token and q_dtype_w == dtypes.i8: run_1stage = token > 32 elif q_type == QuantType.per_Token and q_dtype_w == dtypes.fp8: - run_1stage = True + run_1stage = token > 16 elif q_type != QuantType.per_1x32: run_1stage = token < 256 From 5145cdcad95c51eb1f23d0d42c695b1d64805cfb Mon Sep 17 00:00:00 2001 From: antsaukk Date: Tue, 18 Nov 2025 19:21:14 +0000 Subject: [PATCH 19/21] apply black formatting --- aiter/fused_moe.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 9888b50486..212a06b4f1 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -630,17 +630,14 @@ def FinalFunc(): kernelName2 = "" run_1stage = False if ( - ( - activation, - q_type, - dtype, - q_dtype_a, - q_dtype_w, - use_g1u1, - doweight_stage1, - ) - in fused_moe_1stage_dict[get_gfx()] - ): + activation, + q_type, + dtype, + q_dtype_a, + q_dtype_w, + use_g1u1, + doweight_stage1, + ) in fused_moe_1stage_dict[get_gfx()]: if q_type == QuantType.per_1x128: run_1stage = True and (inter_dim % 256 == 0) elif q_type == QuantType.per_Token and q_dtype_w == dtypes.i8: From e2df6c64fa288f9e36d1f46ae75d92ea8d7c9abf Mon Sep 17 00:00:00 2001 From: Anton Saukkonen <63663359+antsaukk@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:19:41 +0000 Subject: [PATCH 20/21] removing csv and .co files as they will come in separate commit --- aiter/configs/tuned_fmoe.csv | 1 - .../fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv | 1 - ...oe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co | Bin 24032 -> 0 bytes .../fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv | 1 - ...oe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64.co | Bin 23776 -> 0 bytes 5 files changed, 3 deletions(-) delete mode 100755 hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co delete mode 100755 hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64.co diff --git a/aiter/configs/tuned_fmoe.csv b/aiter/configs/tuned_fmoe.csv index f6f2dbe25c..bca541c83e 100644 --- a/aiter/configs/tuned_fmoe.csv +++ b/aiter/configs/tuned_fmoe.csv @@ -772,4 +772,3 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,536.7655,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,536.7655,1,47.26,2646.03 80,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,560.4425,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,560.4425,1,90.53,2544.06 80,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35 -256,16,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv index ae9c6e7abc..1e62666e2c 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv @@ -13,4 +13,3 @@ _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256E,fmoe_bf16_pertoke _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512.co,0,1,0,1,0,32,512 _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384.co,0,1,0,1,0,32,384 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448.co,0,1,0,1,1,32,448 -_ZN5aiter43fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co,0,0,0,1,0,32,64 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_32x64.co deleted file mode 100755 index 74cfb603152492d72ae86322897954ff4befc7ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24032 zcmeHPdvH_NnLn0fS?Eae16wk05$ps%c^V86K?WO4z+mGyj|kb8EeG2&w#=gm=317` z%RJ%`LX$M1JQ6|@LK8yB%U05~-I;|Jnr%yV(rspTJG+0hvzh63AKq^6e&0FwDp!gq ziNx-7rWSMkyWe-d-*?YF_tQD|>W8$iWZf!NT%27)_)ASbCvnm~f?e9*wu$d-h(})` z(ZOF6F%T`dk!$D8U&DZuOh%m)hi>_pk}DLZNT!TQ+(sUBfe_iB!~I0U+YfSEMxx+h zUvJk)?K?xUlbdAr1ZhX&slfauJee<_-^Sm1oACU;>G9+-uK*q%@7uocSZqA8!}tor z3l{E{Q6BpQaG!uSD_~Js@#^wwLVTB;?M?1(Xa06qSBI;u`-!3R`OdDE?$1d1;H7+L zqqoP|=5RJPy1KhBcDbBQj$K`?UKg}9HFbf{(cS56biLTp=4o)YozL%Zw!6N+;iY^} zb91-LOU~zaw|>W^dnJFTv#rPF*xuUF^coL!H1sq#ySfHAZ5aJ;*A^7?@Ytb5+es&cs= zuOE+ayu0`ER0P&N?&X=v+F&P7M>yQwJ9!}j>mGOVY-Pz;LwPyE@$TNsYY|v??4`Vp zHnp|~2PeQ2Xyi2$SKqrPP%OmPKaMrQ-^7~WdK3xHxs`)(Gl~q#yOHaE7i)ssu_pK| ziUj8yoZd!vuoOOzB1On3{8Ow6{w3A~zl$QlIagA^2GAQQ88*!;IXGo7Z!_+7;oGDfe zgEfi-q2-VfbB5V5XUL5r!;iet`D z8gqtq(PVHb-xuZ4WC)oas$$Mi7juR!QDis|1yP}RYix@mM`%Sf#hk$%bB4AkGMsZ2 zD+i)8iWH#(;f*=N?wB(?9z}-pE_ixXl!OpPj?j_t$DH9n%o&EF$Z*c<>D>O+&D8NI zQiKk~iI_8-iaEn`-zLL5`7K?Zo=!)%tEJu5;dL~(?shdf@I&68qLOfxeTobp9|r@R zZLKXGnzz2@N6L-3)7SFbox2^)ZBDOam#1sHl$^B@{?YKKiyfQR7dYWF=dOhdn%g}t zM?>?xMUGBam)Eo1)v>0-Tj*$+*E7%I-M(v{qs7(M<5)0%_o9U*@8x%PweEC!UBSM5 z)md^Szst1)KFba{=96%L92fUWes@b}m!r|s)8XCrQhxWI_6ASer%R*kMgI0Cc>WG1 zyrCkm=kIdvbTxN*VEAwp?D7ZjJ6&Dft)7lySsmb?;Vb`QONtsk#g}yusc{KlGAI{; z$=qR={1`89tjUUsMLF{=Dm4^TUd_Cv1`lrdl4S&LaW0AS#G~cGfXRoCfT3e!mvoy5 zfn+&2m#YoL4<{$(vHUZ;pxw(Rnm*+c~%JcjZFbM|T zoCY-lSOYj0@Fbue@Rxw)fMzq)2w)9hCm{CKN2)S5iW^}ZM6waCV>(%QD|-(im0~6H zRf2DlN0X` zYux>CJ;kazgAOK3w0$+23{?QGTZwHrzI55}GPsVm^mIX(G|886ocsO6o?)}w-2sS6 zC!dW>BEKG)O#WeH3i-`QNfE?s*&+yAxB67j4=7aufqAcY-iqs^qH0|)WEDl#dc=LAs=Qa+ z2X6*4G%6Yl_v8COG?^Is42BG!$&^8o%0Jf{69mKr6Gwfbj@dLxC;0LDNautx`Yfp} zo1-pS&tfN}AXa>yPU4}49vAR>pgl>p;r!5^EZZ<&qdi5oVVu#PD%8R6xU^uvDr|K}GfnDxp_U zsQ`U|8bCjw7H|+y2Y3Kb4|oXB05}9_0zBG_^>e05Yv46iW5`L+y77H<0J>$3X^19px>0>|S4jwgtQwV$$c zd`{qaI>7NqLvNmiQ>RcUGy8;_)?1++DjxL-RB{nE99u_MZB zg*J2u<8?E6ygn1+aXeFb9zz!QoAP)bc1@9|yryWwHT7eE=1nvqstP8W*!4(%T0mQ~ zg!{G2z%O5WN3D}TGZLNodl39F|Hsh+CC>Zx9; zr>%;5y2E>4fIjP-Qa!oC>Zw(#r*=g>VIE~Yp)J?b4ym4c!s=;{R8M;q^@KcSJ)tev zQ=e2%17Y=aP^zaRih4qxvYw#6Lh7kPs;BC(da9S|X)E~odph`DXZ7Ti>d6&WPpwit zwL=|pUT{vp>z}kks;8c?dfFq^(_ZlNIU+cBuzKo~>S-XXo(@X&bOh><^MZ4XvVNke zr#PsmUw!Et{g9j+`H-9+5#rt-*|nf`aX57-WOl5?@!pQ&nAC+EBjfE zCI>JFFc&ZvFb^;fFdO1?AU+r3^H_ZRt!${y91?#k2QU{f7cdVn4=}r^syC;orZ=~! zt~ak}Q!mb!_i6ftY!Y`X`?qnxi@TKrm}pt;&UNBkHvrc^T{xL z`}6Tv>*zqOo(|R<=>E+n>OTnfVX%*YeauAv67KEso-Lkg1W`4+_k+cQ1)?gOVXFF* zTY|b$gXl}B)T9CWX)VLS#99s9=Vyosa9<7g0{HAkik`Whoywo9@IC|Yd-~zIjuidw zb~Yi~1da`41+$w3j?H8xv)uy6M@ccW+XRjsq=ea>0>>`0irHR)<4&@g*}Daf-yx;U zeq7+lp5^fzw*6=$>$~$OHSC<4ZgAW6Gj6+o!R^*xal7r;+;0C3w>y8!?H&KjZSU{7 z-Se;9-u*{z@A(gIKmI@5-YbG#GP^geB)eBsRl$9C=gRD?+4}69Y<()sPpL3BrNX?F z3UiV`&iq_ZtBGJg^K)N&TkPx3dMf87%|s`|7&kee0m~T&Lud zHEdi3fuoOKBI4 zKS8ZWR2dh5A8OK|5srhcfp|o1{7M#oUuwT_0{sw=s5MraP66s54pAT9$l_+D_Y2RV zAL0=8#%|MTKm)`fn&SIeT*lOX;T-xQ4$)+M(xhC^3WdUf^}_mr4Z?I&mIg^@a&7$VaOlp4M0Xk-#Nof`2yKfey)qok9l-2;={{W@6kUuMn zT5@uzB`=R!=FFjEv$J@7b`Fov&ExU;b6C82-aKkvuz;Ei3aEMUVmdZ=9;J1VZ~g*G z>j4)QP`U|l(PBzBL;fX8sO7;2sb$$RYFWOVjukH9@k<}%@wR0={-Nb8-dt2f%_~<@ zb4dv`uU<{Z>_wDrfqW}gQo0qexP;P209UQ1)B*X|tf7{*YpJEIj9NBqpkt+Lc>Kd_ zdHlLE9>0DAi=RtMZyrcilT0GsI#8ii!8~&-bHhgj{Rby<|3UEQNd5!l_qTGYf6m$u zrSbMdVZ6f@?mw*H9aQj!Ch_*6Fy4_V+1_fTf*C)3*&w7 zLGJ⁡X(-J->{%KOe?>VLA7|px`~J;GHhw?We9?!-r3dM ze^$YJO2Ip~hPR&!%dF!&z@JOMZac=D??hg7uPhDSF$X<%4n<@O3Iw^!P^U2Nxe z$u@4U+Q#kGq?_?diJ##bE4LrEa(k_v+w1JyF5AZK_1n0;0Y3{)jr($h2&KO+x9dm- z>@&L$>>_TPrS^?06gISPT*>WpslBRLVMBXW3AZOp?KP_uHni8Q=Jr&ny{=SYLwnsC zZcmrmH$AMdp?%X@Zd;}HE$b9Ev~MZn_I*#1e)W@_2Gm0BDQI#yTD z<2P;Q@msd?_(vQp-t2Tzb7LblyIj=V(n80!IjI5H?uJHcz_q){MGd%iH@8p&uHCJz z)Utg$wY0ZWi^oI9+^szR(d|6Gt)0hrcv!r7#|~=l?xyCR9%|mTi;i{fpaxvKySk|X z*KThQHQ?I4a~Cz>+P!BFwLJD1wd~zXE&KM-vE6%k{C6JX@sIE2@xA+4yt%KBn)~~y zd0>E=_wT1;LLW8Y+U@J723))S1JrP{HSx3t@B4%az>!vSQA;pqQ&JR`d3YVZ2vrx&IXf?_~w=)q39k zY8dae&D{T*f_G8DyR?K<{(czmgU7i40|oC51#e_8ZyyQcjqc|5Bs_QLj~`Og7?t?Z~rKa_rv|%|3d}uEd}r6gS`FYFy4<2bN`PN zysHY{kB{*7ABXXNa*X?bqTszP@z~mZL0Y>nN^AG4(%OAVTDvbxYxf({+I>Y@yRVL~ z-2~RoztpuGKTnOkm6{Ri{hN%z4R)P_Pn zuLu2yjS*;tc4$W(2od$4NV*T|LD-J^k5czR9SHrX|Agy4=tn)M7!dUzrS5||5c*O7 ziKP3W9)#_v|0s1I)Pc~C`cJs-gMQS5l=n!{d|n?({|Tul(18RvmIm~nU_GHeqSO)5 zmUKid=tCHf`cFtbfewUz1L!}&dO}_V-3Q}Lpbz;03Fto|_0%KP6X-v|dcr(+(0#O^ z4`Ci1=szL#1UeA<4WR!7>k0FO={^`^0&OUyo=2g<`cEX?#|(N9wx@&s z6H-s01EGH^=s)4Q5BjHr9#jm7`cFtbfewWJRM3AS={{!AgRngv^q-J=0v!nbQ$ha; z*L~1G9rU1B>uFs7!TafP{byYN`TzgFUGnq#xcE`VYpT?j!3!chY|_4t1Z?uzsNqbPjM_{~6bR@H44#{fB>MFs}c6 zM8@@>(wpP@57$Tj+Vr2!|BkHxu(ca>A6W;I^&j~@Qr3ZF{fDjHp!>)=(4F)jwswQ= zBkMqS(tp_64Z2SNpFKeb`WSFr{~6bRaP1z~fB4!xuK#>Q#`T}lo8$V=U+3E0_g}L9 zga7xkZxp{8g|7>fI8qKIT|)dA9A68i??>yoNxTyN@k`NzyI)9r_`f>rl`6$cW)Rl3 zcEGQtF15mMs9M*pE4I#`n-Bl2v*)yVI$B&^j!vi7jlYEI==M0gZuo9{S7&>-qtV&n z=i<7oDDIh<{64o|nEv&+-m+UDw>)7{lL zr>$*gdmeZmbv1gKVQpSGw{UUuqK1X@3!CS<7Bv?vE^swC8w;I<&c*W=HaQ#S&0o+o zcabYA*zK?Fpj-jp?jtvwLz@he|Fd7!fcZT}w!Ho1h`6s|1>3i}dzr*8ok^H7T9q@I3lHbzN zlkawRyNUD{{oXE;-{om?dYvTS>)P#Q(a!ePMsT~@x;$`6o0{N)lKe(bJN`BT!JiKB zIvW7pz~*}#Nt?65)#jL6vTC8cTex_!ygP4IaCgD{xw3cRk^*_RaB+~fq$n7$VqQVC z-?@*Lhu_tA6xXjWDqF)ZUFc=_))v+yN=+b}@t3?Kq5ONsa^MQ<| zQXtL;cr2Y{%kxAc*#9ny9+@xC2Qp?z@p8M&m)r!W56msM%kzhfRw+*E9PlN-FXi9G#_y%$OP-JA@q@1dv)4oL7kx6nbQkxRO7Ht?GJ(6lcK&7H z$1cs4+GUXWugeYzIz_@9Mr!?>^~=w6A3SYE4p-Q%CqiOFkz_VjsaN_P5vZolVK;D{`8V+vO!OcthssoX{$41o~EKZpB?z#GfBtsqhHaI80~ zM7%o^JGe<^8^w;`X~6t8JcX}3zsdfcpf*?0`iZir19E`~+Tgw>5gZ-1$2^o$a31u15#Y<-0qZyFMdj1Fz=08~okw zR+qb>!PC`sq0{4TbnWhJ@p~ZB*w_g^S67F-!Sj4`tFPYOdM>}+-R602(+l~&rlu~B zpPb9@YWa@G@KXLRcWbxDwWFoI@fDuxs_$-U@^tod%9###qe3#oAf2z@>tZF8O8&ha zo--|r7Df;8MK(lVd+XkB9AT|;u+$g_yX6S8Vvuh42-hehw8uEuEk`&z2I+Q>uu2)> z{1^wj

L7kZ$(~iNV;t<3BP@+Uy3HdzU0bdUu`I^XZaKu-7^K@h#5Lg|ZjW)a zTMp41gLJ!xSR5W=M~tK0a)^6kkZ$u3&s1y(kMW8W<9N3ms3#}wKyuM??$ftBGCdj5-spq90ktR zyZsH`a4UQsM~R48_}4@W{ClDW{t!ojv!0Gle|QqUh@(QpB>ZQh1^$v~fxpI4;B;-5 zG6h$YNE`*!b8t0Dmv99`!WB&MR9I7`nupYQDyXiFSCcXlu8@^*g(-1VI9;ro274R@ zBHJN5;R>@8t}r)_3a2-zreS^@1tOOr+Jq}?jibUjXozyvOJjQ+H6lBrG2sf{ge$bhQQ@qoSUnLP zag>Oh2!FyA_9R^4;W#Rs^T4}TRZB>5)QFskV8Rs+BwS%IjtXb}zK$JV-Ao;iqeSFH zoJhFBlL=RN`fsW5PJVNzue-z5G>n#Xd!bkB`F%?$(y( zcHNs_^CRVI(y7b&ZSFm;rdGG#wcFRZLlkE*!ao}Rbct)r#sW8d=G?h>QB#}Gju@HCS>f zztgi5KFf|c=3{VwoRsuZephoxr>nu&-R|H1LVnlYwt8Rdr_16TMgERPc>f(PcvVAQ z%irzZLyT72z8iaNkQ!&m>sNV*n2#aDC?5nKwG2FgWf5_dR- zAM=$BL+CdWJM;aR9_#58c6yk=m}K(2z#)wn zaZHpaGBnF(hBRIAhBM?D3}?x+7^cZ-41vEN)iU5S1U^IHGXy?E;4=h1L*O%< z?)>xuT9>T}!Felj3?-K?A6gFQ(UFxUNfRdqQjT+fkT|m)PNz2nIa%bh z;fdro!;{Fr3{NJ%9WE(?oUL0WY1_7d2F3xcN|M)#Iwq4Ona#Y;YUn4jGTX6or8flo zj+Mn;NAYT}V|A%_HS*YN@2A7a-zdTBb@^pU=|RWr*^-o#6X4G!H9M4=?)|3cm&=}4 zMz)+`^bG$hgo2DA1Af6xboG{0jBVxH6 z4QWP}J1!lu{PQf53_bL)gy#e8iHZ%+5A8{c4cBY5Co4A0Gul%W8~@`2*i#i7|Dyod z(-b>}+tb087CkI2&B&0X%*+5u-TJwm3`iscPzRU=XaJl7Xc~}~iQbenWS^vwdL)eo zFaW3n3*;&?VCpc~n)4~}%eImUVt`O_yPO>05aWNIyJK2o2M(AG`le*IMRi*p5YN7UyE zZ5R;d8)oo)<1EO>`OM&TOgY?d$>Vj{IYplOoT3fqG>GF_Fu{VTDVShk=OgP032og{ z?$<8|zjE%8r#|;+!@0+NLlMt6mOwt9&ndi)X)X6#Ht;&EN06uP5$MSdG3QRGm71nC zBc^9+%}URpoiUO7ty8sT$8#A7UPAS z96y%J9PAuA%8esSrKSC9dzrMXUwN%4uZ1n*`b(0^-YzTGjXX}G`l(#>Q&m(y)ro%E zrs}7gy!Is+v%xL;$rII2Eux>=RQ-f?)cu6E(oZ`@KXpg-(_Ya}kEr?ydFp;bTj{4> z(NFzR{d7?D(-BoaAy3^;&|eY#R4)3dDypCAL_ci|=VfajcEyGj& zbqr7Uo0F1g8f<2*mZrgGPEMw2u$fa*Xc}yzB#DkMOOi)5S~|WQP?t={ZwAz-(D4;X zvSr*jnhu+VQko8%#cZbOuvx~Br|Gbb8pd(Hk#fGt%=wn_jBoChjA^i$`vS%^*v$Pw zV;XGc{R74{*hVGEn1ja;1dKU&{9w?SgU25jFy`R#4<0b4!)7^r$e0eB<;b8h9X89c zqsDaDMh_g|{6mL0e{hiVj~-=w)_btl%^-bJE=j&FnTdT4U@zcYzyM$#U>{&^QDx7Z zqUxTxMYTP7MO%9CdUuBoc}drEf1dA6zk43@bLu2c>2Y72E=&w-()=5roVkW!72mouQAer zIuqT0zl8=5f_)h5BVZr1(7%U!YrJPlraDR1%>Bfc8A2VldNX8U*fontYP*ZiQ{)jDYGAzII{QB_#C$X zXfqqT`zO`xIW=D8w&!Qu_WqLFEx+b=>uLa5FZ$?B^f1ip>BRpKitRl!+W7;Va|h< zwRU{pgY(}D^UvQah;s$^QMuxM6=mgBt*QzdCPt-#8%NSE>1AEt^+K;us(s zm>rZj4it*_LYu^U34HI3_w_qB@%z3Gzyq)@kR+`UQIkBA-TTc*?UM%4KTc~z)R-56 zA9~WHla7O}gM37N@+y{pM@FA?0{xJWs5e(wo&+>N9-=Y1f#uE2>XV*EKja}A&0UsL zfF{U8v?TYjyzD7`(pmIF9-_tkm_@yw6iS8t8>Nl?o1{(s(a+JEmPV};CQxf;CbimZ zbTl=MQoN?7PoNa9sTpwg@S1A1Q3}^o+r){~HhD6&O`S?@)2GwXtcg5-(qx`LWh&2~ zHl5{LXUw41nKP+%)+}nh>n=KKpFwFQ)VpIQrB#60vnZ_wyz?$fYoLBk4zYe50EE?GiH=P#hN7V0frL}?x1;sQ#y z0N%ZX()*$Q(xudP&pp((d^xqPSV2b%m-75&_wanja-M(h3YKpzDx%g^tEjc4gj(0E zp`*?sO1DD2m8&S-23TA|=>veP*HG$$`fJxx+q!kswqXOcZQ4XfOV{%J`_}RN^&5Ep z#!W1LJ}JF+AWchV5&8Ooa=iwwGuLNr`iP+a-~{eJ2>v<3e}Me)`kbntv-qJ*9zPVt zJ8a|r!z$iE6>o4Nj}J!ij!fqMBP!k@74PU&9zPnzJ2svBkEwWvRlMUfc>H)2@6nmu z|EP*LsNxOH;_;y<-eY%h|6?lN5f$%54v(LR;ypfx`yW^Fj;eSk^LYGZ6z_?7-2a4% zcTB~5asiJ&8O8hVBJTgLig#Sad#ZrPpNisrZwdE*PsMvw#d~@wk3Sv7`~E%L|9us2 zNX2_*IgdXR#d~%I_dl!RJ*MKFD&p}|QM~6?asP8F-U$`&bP122j^dqJ!~JJeyvJ3% zvuk<$Y!vU@I_^KG;+<6Sp5MUZ&qwiI*u?!WsCZ8Z9w~q7h890i=mI+a=q%y?Q&h z*N`s8DW{ zSRUS^zmI;%L!4y(lO+T=1@aK5C2Pmyy*%ENe~f;}L!4%wF#cPw-R0%fT2Vo*RaMkl zQ$t6~%Bcz0?#&g{gll(Y6*b}7U0p*>xOUgoQQQ6ZQ`@#})aG*0(b_tmzvX_OzjYhW zf564^t!_89HZ)MH$3v~n&2)6To0@R#u5X|wT)P`R)P!qyQ!_Q;+TGGZZ98^QTU#5o z`FwQL+rsl7+`;o(+jxGvkL6o;?xfbPE^6)Wrq1fAJYQnX^+d~gg+ar%q+rE8tbkAO%|DA_;{=<*({GNR*-`d+tt$lse+TTyD`}fmP zsh65??GE%&6RzFCerm$Cdtg5`;o5!hAhjJnOl?PwP}{L%bo9VMo`2{t&mTO(^N${5 z`M5@(FXz{m^HJBF7c02`Mb$Ouyy{wYp^C>ZMDbp#;r^FYycboxm+N@^x_rI;;y{6(_-pS*aqj>LhasN9i-bEGf z-EJO#H;VV(Ztj0i#ruJZ_x@fUe?N-%!9(2tfr|IKiZ}cSj}J%jM)q<4h>G`yiWlnT z@lX`+!#?i+P{q5X;(gT5;~z!wez>3ef2iWUsp5TnkjFoc;{E6__y0)6drQUp@ev;X zaTM<-$GHC|D&E_I$JXxiV(q>l*6x?Z+Wo3nyDy5h`*pE)UlMEgTVrcCfwlARb?wH_ zGiUrY0D6nWwr%>r*Yp>rDG1#Kbr=cOPt;$&jqZZ_4CbT$qSjqdhe1E;FIeZBbQjD+ zeI^Kq`iok3K^+GDsK0z0-39d-%t!r2t-GKOgMQRsu+GhN7tBFzCgSsQ&|labfmY{) zIO;HnsK3P0T~Lp~IO;EI-34_R^rQX~t-GKf^_XHn)L+!P3+gcFNBt$1?t*#@#!-J! z>n^Cnpda;@Xx#<dHhJeK|v(NCbmNN_A2=r7@ZLVZH5&lW0GU z=`VOcJ*K~)9yF%EjOj0^*No{ek^0T;)L(ENLfu8tVHEvE(OncBM$um|A9WW+hq;;l zf_bRBC_2o|^cT!S-Q^UlAE?8e1su~~#`G8bOlVAh;qTeU^p}swnEq0FZA^dR`ouq) z{&M_(W9u($?FQXN(P0$*MbTXp9Y)b#*xC)ci=xBaOn+f(H|Q>k4s$d8g{|G7yM*w) zALuY21CHr0WBLoO-DCO-U%SWjmygJp{!)5vOn>>uT)Ut9A4Pw`|NGcGf?o~7$N7mI zDF>1z5I=^-$71pQU?Vq)U*Mm=lzq7Sh2X>gm0^zzkuS_JtZ!+DUkhDkhu;vjuU}tm zUpPM>{@G{GYxT7^dpcbmZoe0Q3Dni)bNRjS-R{ngwk}tLyWQ2*-O=Ie^g~9&4tH}) zdo#p6u0~INceAUdy~*cl@^!l0t*tI!m#d@G*VNML>6+Ko*)XrQbyr&+cpmgL_?cmE zT0FmSNz>i+ix(C)&G+2hRIsGLQ}1plbQijpEL_~^u3xZlQRDo(JvrfFe{Bcl0{C_t zx#o!snS_7q`ba1%ZR-6Y@d+2d!~?zWZ&aC=%ieQ-(}8{veK{03he{x$%?ug?43^?+Vr^F5BF)m`st zbG5enhDUWGntqb_6QF=c|O3*QsJ77hn4dO}QQ@SSk|ndVt3=2wS;Mj0gL_ zWZ9?imFt0m*&<(wD}3Q5c=^EEN?f^qC}uLCR7R2vQhks{U;bF4sQ2_hM#pi$z<#nn2 zE;N2G8z0K`SeZZgDlB{43xCn4@Jn}bf2nxgUmnM8Xq#9N#VyK~RL>k@$N5JODmr>}q~iEw_~Cuf&w&Dz+DPV(||M{x6k8fzkZ` E2V(Bb%7 From efc4dcf2b11faaec5704870b7b8d016bf87de747 Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Wed, 19 Nov 2025 20:11:40 +0000 Subject: [PATCH 21/21] removing log logger.info(f[get_2stage_cfgs] run_1stage) --- aiter/fused_moe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 242a5ded9e..abab9cfc48 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -668,7 +668,6 @@ def FinalFunc(): f"[fused_moe] using {'1stage' if run_1stage else '2stage'} {'default' if cfg is None else tag} for {keys} " ) if run_1stage: - logger.info(f"[get_2stage_cfgs] run_1stage") return MOEMetadata( functools.partial( fused_moe_1stage,